commit d58b121de641c0122652bc3d6096a9d0e1048391
parent 902a8dcb40b3da9492093edd5bda356240f29eb0
Author: Alex Auvolat <alex.auvolat@ens.fr>
Date: Tue, 28 Apr 2015 15:57:35 -0400
Add function for applying model
Diffstat:
5 files changed, 98 insertions(+), 17 deletions(-)
diff --git a/apply_model.py b/apply_model.py
@@ -0,0 +1,43 @@
+import theano
+
+from blocks.graph import ComputationGraph
+
+class Apply(object):
+ def __init__(self, outputs, return_vars, stream):
+ if not isinstance(outputs, list):
+ outputs = [outputs]
+ if not isinstance(return_vars, list):
+ return_vars = [return_vars]
+
+ self.outputs = outputs
+ self.return_vars = return_vars
+ self.stream = stream
+
+ cg = ComputationGraph(self.outputs)
+ self.input_names = [i.name for i in cg.inputs]
+ self.f = theano.function(inputs=cg.inputs, outputs=self.outputs)
+
+ def __iter__(self):
+ self.iterator = self.stream.get_epoch_iterator(as_dict=True)
+ while True:
+ try:
+ batch = next(self.iterator)
+ except StopIteration:
+ return
+
+ inputs = [batch[n] for n in self.input_names]
+ outputs = self.f(*inputs)
+
+ def find_retvar(name):
+ for idx, ov in enumerate(self.outputs):
+ if ov.name == name:
+ return outputs[idx]
+
+ if name in batch:
+ return batch[name]
+
+ raise ValueError('Variable ' + name + ' neither in outputs or in batch variables.')
+
+ yield {name: find_retvar(name) for name in self.return_vars}
+
+
diff --git a/data.py b/data.py
@@ -6,6 +6,7 @@ from enum import Enum
from fuel.datasets import Dataset
from fuel.streams import DataStream
from fuel.iterator import DataIterator
+import theano
if socket.gethostname() == "adeb.laptop":
DATA_PATH = "/Users/adeb/data/taxi"
@@ -14,8 +15,8 @@ else:
client_ids = {int(x): y+1 for y, x in enumerate(open(DATA_PATH+"/client_ids.txt"))}
-porto_center = numpy.array([[ -8.61612, 41.1573]], dtype='float32')
-data_std = numpy.sqrt(numpy.array([[ 0.00333233, 0.00549598]], dtype='float32'))
+porto_center = numpy.array([[ -8.61612, 41.1573]], dtype=theano.config.floatX)
+data_std = numpy.sqrt(numpy.array([[ 0.00333233, 0.00549598]], dtype=theano.config.floatX))
class CallType(Enum):
CENTRAL = 0
@@ -143,8 +144,13 @@ taxi_columns_valid = taxi_columns + [
train_files=["%s/split/train-%02d.csv" % (DATA_PATH, i) for i in range(100)]
valid_files=["%s/split/valid.csv" % (DATA_PATH,)]
+
train_data=TaxiData(train_files, taxi_columns)
-valid_data=TaxiData(valid_files, taxi_columns_valid)
+
+valid_data = TaxiData(valid_files, taxi_columns_valid)
+
+# for the moment - will be changed later
+test_data = valid_data
def train_it():
return DataIterator(DataStream(train_data))
diff --git a/hdist.py b/hdist.py
@@ -1,10 +1,16 @@
from theano import tensor
+import theano
import numpy
+def const(v):
+ if theano.config.floatX == 'float32':
+ return numpy.float32(v)
+ else:
+ return numpy.float64(v)
def hdist(a, b):
- rearth = numpy.float32(6371)
- deg2rad = numpy.float32(3.14159265358979 / 180)
+ rearth = const(6371)
+ deg2rad = const(3.141592653589793 / 180)
lat1 = a[:, 1] * deg2rad
lon1 = a[:, 0] * deg2rad
@@ -15,9 +21,9 @@ def hdist(a, b):
dlon = abs(lon1-lon2)
al = tensor.sin(dlat/2)**2 + tensor.cos(lat1) * tensor.cos(lat2) * (tensor.sin(dlon/2)**2)
- d = tensor.arctan2(tensor.sqrt(al), tensor.sqrt(numpy.float32(1)-al))
+ d = tensor.arctan2(tensor.sqrt(al), tensor.sqrt(const(1)-al))
- hd = 2 * rearth * d
+ hd = const(2) * rearth * d
return tensor.switch(tensor.eq(hd, float('nan')), (a-b).norm(2, axis=1), hd)
diff --git a/make_valid.py b/make_valid.py
@@ -13,7 +13,6 @@ with open("valid-full.csv") as f:
def make_valid_item(l):
polyline = ast.literal_eval(l[-1])
- print len(polyline)
last = polyline[-1]
cut_idx = random.randrange(len(polyline)-5)
cut = polyline[:cut_idx+6]
@@ -30,3 +29,8 @@ with open("valid.csv", "w") as f:
wr = csv.writer(f)
for r in vlines:
wr.writerow(r)
+
+with open("valid-solution.csv", "w") as f:
+ wr = csv.writer(f)
+ for r in vlines:
+ wr.writerow([r[0], r[-2], r[-3]])
diff --git a/model.py b/model.py
@@ -2,6 +2,8 @@ import logging
import os
from argparse import ArgumentParser
+import csv
+
import numpy
import theano
@@ -31,6 +33,7 @@ from blocks.extensions.monitoring import DataStreamMonitoring
import data
import transformers
import hdist
+import apply_model
n_dow = 7 # number of division for dayofweek/dayofmonth/hourofday
n_dom = 31
@@ -43,7 +46,9 @@ n_begin_end_pts = 5 # how many points we consider at the beginning and end o
n_end_pts = 5
dim_embed = 50
-dim_hidden = 200
+dim_input = n_begin_end_pts * 2 * 2 + dim_embed + dim_embed
+dim_hidden = [200]
+dim_output = 2
learning_rate = 0.002
momentum = 0.9
@@ -68,16 +73,15 @@ def main():
# Define the model
client_embed_table = LookupTable(length=n_clients+1, dim=dim_embed, name='client_lookup')
stand_embed_table = LookupTable(length=n_stands+1, dim=dim_embed, name='stand_lookup')
- hidden_layer = MLP(activations=[Rectifier()],
- dims=[n_begin_end_pts * 2 * 2 + dim_embed + dim_embed, dim_hidden])
- output_layer = Linear(input_dim=dim_hidden, output_dim=2)
+ hidden_layer = MLP(activations=[Rectifier() for _ in dim_hidden],
+ dims=[dim_input] + dim_hidden)
+ output_layer = Linear(input_dim=dim_hidden[-1], output_dim=dim_output)
# Create the Theano variables
client_embed = client_embed_table.apply(x_client).flatten(ndim=2)
stand_embed = stand_embed_table.apply(x_stand).flatten(ndim=2)
- inputs = tensor.concatenate([x_firstk, x_lastk,
- client_embed.zeros_like(), stand_embed.zeros_like()],
+ inputs = tensor.concatenate([x_firstk, x_lastk, client_embed, stand_embed],
axis=1)
# inputs = theano.printing.Print("inputs")(inputs)
hidden = hidden_layer.apply(inputs)
@@ -86,6 +90,7 @@ def main():
# Normalize & Center
outputs = data.data_std * outputs + data.porto_center
+ outputs.name = 'outputs'
# Calculate the cost
cost = (outputs - y).norm(2, axis=1).mean()
@@ -121,7 +126,7 @@ def main():
valid = transformers.add_last_k(n_begin_end_pts, valid)
valid = transformers.concat_destination_xy(valid)
valid = transformers.Select(valid, ('origin_stand', 'origin_call', 'first_k', 'last_k', 'destination'))
- valid_stream = Batch(valid, iteration_scheme=ConstantScheme(batch_size))
+ valid_stream = Batch(valid, iteration_scheme=ConstantScheme(1000))
# Training
@@ -135,8 +140,8 @@ def main():
extensions=[DataStreamMonitoring([cost, hcost], valid_stream,
prefix='valid',
- every_n_batches=1000),
- Printing(every_n_batches=1000),
+ every_n_batches=1),
+ Printing(every_n_batches=1),
# Dump('taxi_model', every_n_batches=100),
# LoadFromDump('taxi_model'),
]
@@ -148,6 +153,23 @@ def main():
extensions=extensions)
main_loop.run()
+ # Produce an output on the test data
+ test = data.test_data
+ test = DataStream(test)
+ test = transformers.add_first_k(n_begin_end_pts, test)
+ test = transformers.add_last_k(n_begin_end_pts, test)
+ test = transformers.Select(test, ('trip_id', 'origin_stand', 'origin_call', 'first_k', 'last_k'))
+ test_stream = Batch(test, iteration_scheme=ConstantScheme(1000))
+
+ outfile = open("test-output.csv", "w")
+ outcsv = csv.writer(outfile)
+ for out in apply_model.Apply(outputs=outputs, stream=test_stream, return_vars=['trip_id', 'outputs']):
+ dest = out['outputs']
+ for i, trip in enumerate(out['trip_id']):
+ outcsv.writerow([trip, repr(dest[i, 1]), repr(dest[i, 0])])
+ outfile.close()
+
+
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
main()