taxi

Winning entry to the Kaggle taxi competition
git clone https://esimon.eu/repos/taxi.git
Log | Files | Refs | README

commit 2a20bc827a8c1c9b6e74ef4e1234788207be45b8
parent d3dbbe68642320b9a225b1d3515f0181916df8ad
Author: Alex Auvolat <alex.auvolat@ens.fr>
Date:   Sat, 25 Jul 2015 17:31:17 -0400

Add batch sorting

Diffstat:
Mconfig/memory_network_bidir_momentum.py | 6+++---
Mmodel/memory_network.py | 27++++++++++++++++++---------
Mmodel/stream.py | 10+++++-----
3 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/config/memory_network_bidir_momentum.py b/config/memory_network_bidir_momentum.py @@ -51,8 +51,8 @@ batch_sort_size = 20 max_splits = 100 num_cuts = 1000 -train_candidate_size = 1000 -valid_candidate_size = 1000 -test_candidate_size = 1000 +train_candidate_size = 300 +valid_candidate_size = 300 +test_candidate_size = 300 step_rule = Momentum(learning_rate=0.01, momentum=0.9) diff --git a/model/memory_network.py b/model/memory_network.py @@ -215,7 +215,7 @@ class StreamRecurrent(StreamBase): ] self.candidate_inputs = self.prefix_inputs - def candidate_stream(self, n_candidates): + def candidate_stream(self, n_candidates, sortmap=True): candidate_stream = DataStream(self.train_dataset, iteration_scheme=ShuffledExampleScheme(self.train_dataset.num_examples)) if not data.tvt: @@ -226,8 +226,14 @@ class StreamRecurrent(StreamBase): if not data.tvt: candidate_stream = transformers.add_destination(candidate_stream) - candidate_stream = Batch(candidate_stream, - iteration_scheme=ConstantScheme(n_candidates)) + if sortmap: + candidate_stream = transformers.balanced_batch(candidate_stream, + key='latitude', + batch_size=n_candidates, + batch_sort_size=self.config.batch_sort_size) + else: + candidate_stream = Batch(candidate_stream, + iteration_scheme=ConstantScheme(n_candidates)) candidate_stream = Padding(candidate_stream, mask_sources=['latitude', 'longitude']) @@ -247,9 +253,9 @@ class StreamRecurrent(StreamBase): prefix_stream = transformers.taxi_add_datetime(prefix_stream) prefix_stream = transformers.balanced_batch(prefix_stream, - key='latitude', - batch_size=self.config.batch_size, - batch_sort_size=self.config.batch_sort_size) + key='latitude', + batch_size=self.config.batch_size, + batch_sort_size=self.config.batch_sort_size) prefix_stream = Padding(prefix_stream, mask_sources=['latitude', 'longitude']) @@ -271,8 +277,11 @@ class StreamRecurrent(StreamBase): prefix_stream = transformers.taxi_add_datetime(prefix_stream) - prefix_stream = Batch(prefix_stream, - iteration_scheme=ConstantScheme(self.config.batch_size)) + prefix_stream = transformers.balanced_batch(prefix_stream, + key='latitude', + batch_size=self.config.batch_size, + batch_sort_size=self.config.batch_sort_size) + prefix_stream = Padding(prefix_stream, mask_sources=['latitude', 'longitude']) candidate_stream = self.candidate_stream(self.config.valid_candidate_size) @@ -298,7 +307,7 @@ class StreamRecurrent(StreamBase): iteration_scheme=ConstantScheme(self.config.batch_size)) prefix_stream = Padding(prefix_stream, mask_sources=['latitude', 'longitude']) - candidate_stream = self.candidate_stream(self.config.test_candidate_size) + candidate_stream = self.candidate_stream(self.config.test_candidate_size, False) sources = prefix_stream.sources + tuple('candidate_%s' % k for k in candidate_stream.sources) stream = Merge((prefix_stream, candidate_stream), sources) diff --git a/model/stream.py b/model/stream.py @@ -33,17 +33,13 @@ class StreamRec(object): stream = transformers.TaxiExcludeEmptyTrips(stream) stream = transformers.taxi_add_datetime(stream) - stream = transformers.Select(stream, tuple(v for v in req_vars if not v.endswith('_mask'))) stream = transformers.balanced_batch(stream, key='latitude', batch_size=self.config.batch_size, batch_sort_size=self.config.batch_sort_size) - stream = Padding(stream, mask_sources=['latitude', 'longitude']) - stream = transformers.Select(stream, req_vars) - stream = MultiProcessing(stream) return stream @@ -54,9 +50,13 @@ class StreamRec(object): stream = transformers.taxi_add_datetime(stream) stream = transformers.Select(stream, tuple(v for v in req_vars if not v.endswith('_mask'))) - stream = Batch(stream, iteration_scheme=ConstantScheme(self.config.batch_size)) + stream = transformers.balanced_batch(stream, key='latitude', + batch_size=self.config.batch_size, + batch_sort_size=self.config.batch_sort_size) stream = Padding(stream, mask_sources=['latitude', 'longitude']) stream = transformers.Select(stream, req_vars) + stream = MultiProcessing(stream) + return stream def test(self, req_vars):