taxi

Winning entry to the Kaggle taxi competition
git clone https://esimon.eu/repos/taxi.git

commit d58b121de641c0122652bc3d6096a9d0e1048391
parent 902a8dcb40b3da9492093edd5bda356240f29eb0
Author: Alex Auvolat <alex.auvolat@ens.fr>
Date:   Tue, 28 Apr 2015 15:57:35 -0400

Add function for applying model

Diffstat:
A  apply_model.py  |  43 +++++++++++++++++++++++++++++++++++++++++++
M  data.py         |  12 +++++++++---
M  hdist.py        |  14 ++++++++++----
M  make_valid.py   |   6 +++++-
M  model.py        |  40 +++++++++++++++++++++++++++++++---------
5 files changed, 98 insertions(+), 17 deletions(-)

diff --git a/apply_model.py b/apply_model.py
@@ -0,0 +1,43 @@
+import theano
+
+from blocks.graph import ComputationGraph
+
+class Apply(object):
+    def __init__(self, outputs, return_vars, stream):
+        if not isinstance(outputs, list):
+            outputs = [outputs]
+        if not isinstance(return_vars, list):
+            return_vars = [return_vars]
+
+        self.outputs = outputs
+        self.return_vars = return_vars
+        self.stream = stream
+
+        cg = ComputationGraph(self.outputs)
+        self.input_names = [i.name for i in cg.inputs]
+        self.f = theano.function(inputs=cg.inputs, outputs=self.outputs)
+
+    def __iter__(self):
+        self.iterator = self.stream.get_epoch_iterator(as_dict=True)
+        while True:
+            try:
+                batch = next(self.iterator)
+            except StopIteration:
+                return
+
+            inputs = [batch[n] for n in self.input_names]
+            outputs = self.f(*inputs)
+
+            def find_retvar(name):
+                for idx, ov in enumerate(self.outputs):
+                    if ov.name == name:
+                        return outputs[idx]
+
+                if name in batch:
+                    return batch[name]
+
+                raise ValueError('Variable ' + name + ' neither in outputs or in batch variables.')
+
+            yield {name: find_retvar(name) for name in self.return_vars}
+
+
diff --git a/data.py b/data.py
@@ -6,6 +6,7 @@ from enum import Enum
 from fuel.datasets import Dataset
 from fuel.streams import DataStream
 from fuel.iterator import DataIterator
+import theano
 
 if socket.gethostname() == "adeb.laptop":
     DATA_PATH = "/Users/adeb/data/taxi"
@@ -14,8 +15,8 @@ else:
 
 client_ids = {int(x): y+1 for y, x in enumerate(open(DATA_PATH+"/client_ids.txt"))}
 
-porto_center = numpy.array([[ -8.61612, 41.1573]], dtype='float32')
-data_std = numpy.sqrt(numpy.array([[ 0.00333233, 0.00549598]], dtype='float32'))
+porto_center = numpy.array([[ -8.61612, 41.1573]], dtype=theano.config.floatX)
+data_std = numpy.sqrt(numpy.array([[ 0.00333233, 0.00549598]], dtype=theano.config.floatX))
 
 class CallType(Enum):
     CENTRAL = 0
@@ -143,8 +144,13 @@ taxi_columns_valid = taxi_columns + [
 
 train_files=["%s/split/train-%02d.csv" % (DATA_PATH, i) for i in range(100)]
 valid_files=["%s/split/valid.csv" % (DATA_PATH,)]
+
 train_data=TaxiData(train_files, taxi_columns)
-valid_data=TaxiData(valid_files, taxi_columns_valid)
+
+valid_data = TaxiData(valid_files, taxi_columns_valid)
+
+# for the moment - will be changed later
+test_data = valid_data
 
 def train_it():
     return DataIterator(DataStream(train_data))
diff --git a/hdist.py b/hdist.py
@@ -1,10 +1,16 @@
 from theano import tensor
+import theano
 import numpy
 
+def const(v):
+    if theano.config.floatX == 'float32':
+        return numpy.float32(v)
+    else:
+        return numpy.float64(v)
 
 def hdist(a, b):
-    rearth = numpy.float32(6371)
-    deg2rad = numpy.float32(3.14159265358979 / 180)
+    rearth = const(6371)
+    deg2rad = const(3.141592653589793 / 180)
 
     lat1 = a[:, 1] * deg2rad
     lon1 = a[:, 0] * deg2rad
@@ -15,9 +21,9 @@
     dlon = abs(lon1-lon2)
 
     al = tensor.sin(dlat/2)**2 + tensor.cos(lat1) * tensor.cos(lat2) * (tensor.sin(dlon/2)**2)
-    d = tensor.arctan2(tensor.sqrt(al), tensor.sqrt(numpy.float32(1)-al))
+    d = tensor.arctan2(tensor.sqrt(al), tensor.sqrt(const(1)-al))
 
-    hd = 2 * rearth * d
+    hd = const(2) * rearth * d
 
     return tensor.switch(tensor.eq(hd, float('nan')), (a-b).norm(2, axis=1), hd)
diff --git a/make_valid.py b/make_valid.py
@@ -13,7 +13,6 @@ with open("valid-full.csv") as f:
 
 def make_valid_item(l):
     polyline = ast.literal_eval(l[-1])
-    print len(polyline)
     last = polyline[-1]
     cut_idx = random.randrange(len(polyline)-5)
     cut = polyline[:cut_idx+6]
@@ -30,3 +29,8 @@ with open("valid.csv", "w") as f:
     wr = csv.writer(f)
     for r in vlines:
         wr.writerow(r)
+
+with open("valid-solution.csv", "w") as f:
+    wr = csv.writer(f)
+    for r in vlines:
+        wr.writerow([r[0], r[-2], r[-3]])
diff --git a/model.py b/model.py
@@ -2,6 +2,8 @@ import logging
 import os
 from argparse import ArgumentParser
 
+import csv
+
 import numpy
 
 import theano
@@ -31,6 +33,7 @@ from blocks.extensions.monitoring import DataStreamMonitoring
 import data
 import transformers
 import hdist
+import apply_model
 
 n_dow = 7  # number of division for dayofweek/dayofmonth/hourofday
 n_dom = 31
@@ -43,7 +46,9 @@ n_begin_end_pts = 5  # how many points we consider at the beginning and end o
 n_end_pts = 5
 
 dim_embed = 50
-dim_hidden = 200
+dim_input = n_begin_end_pts * 2 * 2 + dim_embed + dim_embed
+dim_hidden = [200]
+dim_output = 2
 
 learning_rate = 0.002
 momentum = 0.9
@@ -68,16 +73,15 @@ def main():
 
     # Define the model
     client_embed_table = LookupTable(length=n_clients+1, dim=dim_embed, name='client_lookup')
     stand_embed_table = LookupTable(length=n_stands+1, dim=dim_embed, name='stand_lookup')
-    hidden_layer = MLP(activations=[Rectifier()],
-                       dims=[n_begin_end_pts * 2 * 2 + dim_embed + dim_embed, dim_hidden])
-    output_layer = Linear(input_dim=dim_hidden, output_dim=2)
+    hidden_layer = MLP(activations=[Rectifier() for _ in dim_hidden],
+                       dims=[dim_input] + dim_hidden)
+    output_layer = Linear(input_dim=dim_hidden[-1], output_dim=dim_output)
 
     # Create the Theano variables
     client_embed = client_embed_table.apply(x_client).flatten(ndim=2)
     stand_embed = stand_embed_table.apply(x_stand).flatten(ndim=2)
-    inputs = tensor.concatenate([x_firstk, x_lastk,
-                                 client_embed.zeros_like(), stand_embed.zeros_like()],
+    inputs = tensor.concatenate([x_firstk, x_lastk, client_embed, stand_embed],
                                 axis=1)
     # inputs = theano.printing.Print("inputs")(inputs)
     hidden = hidden_layer.apply(inputs)
@@ -86,6 +90,7 @@ def main():
 
     # Normalize & Center
     outputs = data.data_std * outputs + data.porto_center
+    outputs.name = 'outputs'
 
     # Calculate the cost
     cost = (outputs - y).norm(2, axis=1).mean()
@@ -121,7 +126,7 @@ def main():
     valid = transformers.add_last_k(n_begin_end_pts, valid)
     valid = transformers.concat_destination_xy(valid)
     valid = transformers.Select(valid, ('origin_stand', 'origin_call', 'first_k', 'last_k', 'destination'))
-    valid_stream = Batch(valid, iteration_scheme=ConstantScheme(batch_size))
+    valid_stream = Batch(valid, iteration_scheme=ConstantScheme(1000))
 
     # Training
 
@@ -135,8 +140,8 @@ def main():
 
     extensions=[DataStreamMonitoring([cost, hcost], valid_stream,
                                      prefix='valid',
-                                     every_n_batches=1000),
-                Printing(every_n_batches=1000),
+                                     every_n_batches=1),
+                Printing(every_n_batches=1),
                 # Dump('taxi_model', every_n_batches=100),
                 # LoadFromDump('taxi_model'),
                ]
@@ -148,6 +153,23 @@ def main():
                          extensions=extensions)
     main_loop.run()
 
+    # Produce an output on the test data
+    test = data.test_data
+    test = DataStream(test)
+    test = transformers.add_first_k(n_begin_end_pts, test)
+    test = transformers.add_last_k(n_begin_end_pts, test)
+    test = transformers.Select(test, ('trip_id', 'origin_stand', 'origin_call', 'first_k', 'last_k'))
+    test_stream = Batch(test, iteration_scheme=ConstantScheme(1000))
+
+    outfile = open("test-output.csv", "w")
+    outcsv = csv.writer(outfile)
+    for out in apply_model.Apply(outputs=outputs, stream=test_stream, return_vars=['trip_id', 'outputs']):
+        dest = out['outputs']
+        for i, trip in enumerate(out['trip_id']):
+            outcsv.writerow([trip, repr(dest[i, 1]), repr(dest[i, 0])])
+    outfile.close()
+
+
 if __name__ == "__main__":
     logging.basicConfig(level=logging.INFO)
     main()
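
For reference, a minimal sketch of how the new Apply helper is meant to be driven; the variable names below are placeholders and are not part of this commit (the test-output loop added to model.py above is the actual usage):

    # Hypothetical usage sketch of apply_model.Apply (placeholder names).
    # 'outputs' is a named Theano variable taken from the model graph and
    # 'test_stream' is a Fuel stream whose batches provide every input of
    # that graph plus a 'trip_id' source.
    import apply_model

    for out in apply_model.Apply(outputs=outputs, stream=test_stream,
                                 return_vars=['trip_id', 'outputs']):
        # Each yielded dict maps a requested name either to a computed graph
        # output (matched by variable name) or, failing that, to a raw batch value.
        print(out['trip_id'], out['outputs'])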