taxi

Winning entry to the Kaggle taxi competition
git clone https://esimon.eu/repos/taxi.git
Log | Files | Refs | README

commit 39e549f05e568e4153381f025b3a0f256e9a7b7a
parent df50f103c1167f54a3ec04c1380fc95e4a023428
Author: Alex Auvolat <alex.auvolat@ens.fr>
Date:   Fri, 22 May 2015 10:00:15 -0400

Make indexing faster by indexing only one column and querying a range

Diffstat:
M data/cut.py                     | 6 ++++--
M data/make_time_index.py         | 2 +-
M model/dest_simple_mlp_tgtcls.py | 2 +-
M model/mlp.py                    | 7 +++++--
4 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/data/cut.py b/data/cut.py
@@ -24,8 +24,10 @@ class TaxiTimeCutScheme(IterationScheme):
         with sqlite3.connect(self.dbfile) as db:
             c = db.cursor()
             for cut in cuts:
-                l = l + [i for (i,) in
-                    c.execute('SELECT trip FROM trip_times WHERE begin <= ? AND end >= ?', (cut, cut))]
+                part = [i for (i,) in
+                    c.execute('SELECT trip FROM trip_times WHERE begin >= ? AND begin <= ? AND end >= ?',
+                              (cut - 40000, cut, cut))]
+                l = l + part
 
         return iter_(l)
 
diff --git a/data/make_time_index.py b/data/make_time_index.py
@@ -39,7 +39,7 @@ def make_valid(outpath):
     c.executemany('INSERT INTO trip_times(trip, begin, end) VALUES(?, ?, ?)', times)
     timedb.commit()
     print "Creating index..."
-    c.execute('''CREATE INDEX trip_time_index ON trip_times (begin, end)''')
+    c.execute('''CREATE INDEX trip_begin_index ON trip_times (begin)''')
 
 
 if __name__ == '__main__':
diff --git a/model/dest_simple_mlp_tgtcls.py b/model/dest_simple_mlp_tgtcls.py
@@ -9,7 +9,7 @@ from model.mlp import FFMLP, Stream
 
 class Model(FFMLP):
     def __init__(self, config, **kwargs):
-        super(Model, self, output_layer=Softmax).__init__(config, **kwargs)
+        super(Model, self).__init__(config, output_layer=Softmax, **kwargs)
         self.classes = theano.shared(numpy.array(config.tgtcls, dtype=theano.config.floatX), name='classes')
 
     @application(outputs=['destination'])
diff --git a/model/mlp.py b/model/mlp.py
@@ -1,6 +1,6 @@
 from theano import tensor
 
-from fuel.transformers import Batch
+from fuel.transformers import Batch, MultiProcessing
 from fuel.streams import DataStream
 from fuel.schemes import ConstantScheme, ShuffledExampleScheme
 from blocks.bricks import application, MLP, Rectifier, Initializable
@@ -63,7 +63,10 @@ class Stream(object):
         stream = transformers.TaxiAddDateTime(stream)
         stream = transformers.TaxiAddFirstLastLen(self.config.n_begin_end_pts, stream)
         stream = transformers.Select(stream, tuple(req_vars))
-        return Batch(stream, iteration_scheme=ConstantScheme(self.config.batch_size))
+
+        stream = Batch(stream, iteration_scheme=ConstantScheme(self.config.batch_size))
+
+        return stream
 
     def valid(self, req_vars):
         stream = TaxiStream(self.config.valid_set, 'valid.hdf5')