taxi

Winning entry to the Kaggle taxi competition
git clone https://esimon.eu/repos/taxi.git
Log | Files | Refs | README

commit c195fd437b76d00ee780cef49903266165f001a7
parent d58b121de641c0122652bc3d6096a9d0e1048391
Author: Alex Auvolat <alex.auvolat@ens.fr>
Date:   Tue, 28 Apr 2015 16:41:46 -0400

Support polylines with <5 points

Diffstat:
M data.py         | 14 +++++++++-----
M make_valid.py   |  7 ++++---
M model.py        |  5 +++--
M transformers.py | 30 ++++++++++++++++++++++--------
4 files changed, 38 insertions(+), 18 deletions(-)

diff --git a/data.py b/data.py @@ -15,6 +15,12 @@ else: client_ids = {int(x): y+1 for y, x in enumerate(open(DATA_PATH+"/client_ids.txt"))} +def get_client_id(n): + if n in client_ids: + return client_ids[n] + else: + return 0 + porto_center = numpy.array([[ -8.61612, 41.1573]], dtype=theano.config.floatX) data_std = numpy.sqrt(numpy.array([[ 0.00333233, 0.00549598]], dtype=theano.config.floatX)) @@ -127,7 +133,7 @@ class TaxiData(Dataset): taxi_columns = [ ("trip_id", lambda x: x), ("call_type", CallType.from_data), - ("origin_call", lambda x: 0 if x == '' or x == 'NA' else client_ids[int(x)]), + ("origin_call", lambda x: 0 if x == '' or x == 'NA' else get_client_id(int(x))), ("origin_stand", lambda x: 0 if x == '' or x == 'NA' else int(x)), ("taxi_id", int), ("timestamp", int), @@ -144,13 +150,11 @@ taxi_columns_valid = taxi_columns + [ train_files=["%s/split/train-%02d.csv" % (DATA_PATH, i) for i in range(100)] valid_files=["%s/split/valid.csv" % (DATA_PATH,)] +test_file="%s/test.csv" % (DATA_PATH,) train_data=TaxiData(train_files, taxi_columns) - valid_data = TaxiData(valid_files, taxi_columns_valid) - -# for the moment - will be changed later -test_data = valid_data +test_data = TaxiData(test_file, taxi_columns, has_header=True) def train_it(): return DataIterator(DataStream(train_data)) diff --git a/make_valid.py b/make_valid.py @@ -14,8 +14,8 @@ with open("valid-full.csv") as f: def make_valid_item(l): polyline = ast.literal_eval(l[-1]) last = polyline[-1] - cut_idx = random.randrange(len(polyline)-5) - cut = polyline[:cut_idx+6] + cut_idx = random.randrange(len(polyline)+1) + cut = polyline[:cut_idx] return l[:-1] + [ cut.__str__(), last[0], @@ -23,7 +23,7 @@ def make_valid_item(l): 15 * (len(polyline)-1), ] -vlines = map(make_valid_item, filter(lambda l: (len(ast.literal_eval(l[-1])) > 5), vlines)) +vlines = map(make_valid_item, filter(lambda l: (len(ast.literal_eval(l[-1])) > 0), vlines)) with open("valid.csv", "w") as f: wr = csv.writer(f) @@ -32,5 +32,6 
@@ with open("valid.csv", "w") as f: with open("valid-solution.csv", "w") as f: wr = csv.writer(f) + wr.writerow(["TRIP_ID", "LATITUDE", "LONGITUDE"]) for r in vlines: wr.writerow([r[0], r[-2], r[-3]]) diff --git a/model.py b/model.py @@ -140,8 +140,8 @@ def main(): extensions=[DataStreamMonitoring([cost, hcost], valid_stream, prefix='valid', - every_n_batches=1), - Printing(every_n_batches=1), + every_n_batches=1000), + Printing(every_n_batches=1000), # Dump('taxi_model', every_n_batches=100), # LoadFromDump('taxi_model'), ] @@ -163,6 +163,7 @@ def main(): outfile = open("test-output.csv", "w") outcsv = csv.writer(outfile) + outcsv.writerow(["TRIP_ID", "LATITUDE", "LONGITUDE"]) for out in apply_model.Apply(outputs=outputs, stream=test_stream, return_vars=['trip_id', 'outputs']): dest = out['outputs'] for i, trip in enumerate(out['trip_id']): diff --git a/transformers.py b/transformers.py @@ -3,6 +3,17 @@ import numpy import theano import random +def at_least_k(k, pl, pad_at_begin): + if len(pl) == 0: + pl = [[ -8.61612, 41.1573]] + if len(pl) < k: + if pad_at_begin: + pl = [pl[0]] * (k - len(pl)) + pl + else: + pl = pl + [pl[-1]] * (k - len(pl)) + return pl + + class Select(Transformer): def __init__(self, data_stream, sources): super(Select, self).__init__(data_stream) @@ -18,31 +29,34 @@ class Select(Transformer): def add_first_k(k, stream): id_polyline=stream.sources.index('polyline') def first_k(x): - return (numpy.array(x[id_polyline][:k], dtype=theano.config.floatX).flatten(),) - stream = Filter(stream, lambda x: len(x[id_polyline])>=k) + pl = at_least_k(k, x[id_polyline], False) + return (numpy.array(pl[:k], dtype=theano.config.floatX).flatten(),) stream = Mapping(stream, first_k, ('first_k',)) return stream def add_random_k(k, stream): id_polyline=stream.sources.index('polyline') def random_k(x): - loc = random.randrange(len(x[id_polyline])-k+1) - return (numpy.array(x[id_polyline][loc:loc+k], dtype=theano.config.floatX).flatten(),) - stream = 
Filter(stream, lambda x: len(x[id_polyline])>=k) + pl = at_least_k(k, x[id_polyline], True) + loc = random.randrange(len(pl)-k+1) + return (numpy.array(pl[loc:loc+k], dtype=theano.config.floatX).flatten(),) stream = Mapping(stream, random_k, ('last_k',)) return stream def add_last_k(k, stream): id_polyline=stream.sources.index('polyline') def last_k(x): - return (numpy.array(x[id_polyline][-k:], dtype=theano.config.floatX).flatten(),) - stream = Filter(stream, lambda x: len(x[id_polyline])>=k) + pl = at_least_k(k, x[id_polyline], True) + return (numpy.array(pl[-k:], dtype=theano.config.floatX).flatten(),) stream = Mapping(stream, last_k, ('last_k',)) return stream def add_destination(stream): id_polyline=stream.sources.index('polyline') - return Mapping(stream, lambda x: (numpy.array(x[id_polyline][-1], dtype=theano.config.floatX),), ('destination',)) + return Mapping(stream, + lambda x: + (numpy.array(at_least_k(1, x[id_polyline], True)[-1], dtype=theano.config.floatX),), + ('destination',)) def concat_destination_xy(stream): id_dx=stream.sources.index('destination_x')