commit 2a20bc827a8c1c9b6e74ef4e1234788207be45b8
parent d3dbbe68642320b9a225b1d3515f0181916df8ad
Author: Alex Auvolat <alex.auvolat@ens.fr>
Date: Sat, 25 Jul 2015 17:31:17 -0400
Add batch sorting
Diffstat:
3 files changed, 26 insertions(+), 17 deletions(-)
diff --git a/config/memory_network_bidir_momentum.py b/config/memory_network_bidir_momentum.py
@@ -51,8 +51,8 @@ batch_sort_size = 20
max_splits = 100
num_cuts = 1000
-train_candidate_size = 1000
-valid_candidate_size = 1000
-test_candidate_size = 1000
+train_candidate_size = 300
+valid_candidate_size = 300
+test_candidate_size = 300
step_rule = Momentum(learning_rate=0.01, momentum=0.9)
diff --git a/model/memory_network.py b/model/memory_network.py
@@ -215,7 +215,7 @@ class StreamRecurrent(StreamBase):
]
self.candidate_inputs = self.prefix_inputs
- def candidate_stream(self, n_candidates):
+ def candidate_stream(self, n_candidates, sortmap=True):
candidate_stream = DataStream(self.train_dataset,
iteration_scheme=ShuffledExampleScheme(self.train_dataset.num_examples))
if not data.tvt:
@@ -226,8 +226,14 @@ class StreamRecurrent(StreamBase):
if not data.tvt:
candidate_stream = transformers.add_destination(candidate_stream)
- candidate_stream = Batch(candidate_stream,
- iteration_scheme=ConstantScheme(n_candidates))
+ if sortmap:
+ candidate_stream = transformers.balanced_batch(candidate_stream,
+ key='latitude',
+ batch_size=n_candidates,
+ batch_sort_size=self.config.batch_sort_size)
+ else:
+ candidate_stream = Batch(candidate_stream,
+ iteration_scheme=ConstantScheme(n_candidates))
candidate_stream = Padding(candidate_stream,
mask_sources=['latitude', 'longitude'])
@@ -247,9 +253,9 @@ class StreamRecurrent(StreamBase):
prefix_stream = transformers.taxi_add_datetime(prefix_stream)
prefix_stream = transformers.balanced_batch(prefix_stream,
- key='latitude',
- batch_size=self.config.batch_size,
- batch_sort_size=self.config.batch_sort_size)
+ key='latitude',
+ batch_size=self.config.batch_size,
+ batch_sort_size=self.config.batch_sort_size)
prefix_stream = Padding(prefix_stream, mask_sources=['latitude', 'longitude'])
@@ -271,8 +277,11 @@ class StreamRecurrent(StreamBase):
prefix_stream = transformers.taxi_add_datetime(prefix_stream)
- prefix_stream = Batch(prefix_stream,
- iteration_scheme=ConstantScheme(self.config.batch_size))
+ prefix_stream = transformers.balanced_batch(prefix_stream,
+ key='latitude',
+ batch_size=self.config.batch_size,
+ batch_sort_size=self.config.batch_sort_size)
+
prefix_stream = Padding(prefix_stream, mask_sources=['latitude', 'longitude'])
candidate_stream = self.candidate_stream(self.config.valid_candidate_size)
@@ -298,7 +307,7 @@ class StreamRecurrent(StreamBase):
iteration_scheme=ConstantScheme(self.config.batch_size))
prefix_stream = Padding(prefix_stream, mask_sources=['latitude', 'longitude'])
- candidate_stream = self.candidate_stream(self.config.test_candidate_size)
+ candidate_stream = self.candidate_stream(self.config.test_candidate_size, False)
sources = prefix_stream.sources + tuple('candidate_%s' % k for k in candidate_stream.sources)
stream = Merge((prefix_stream, candidate_stream), sources)
diff --git a/model/stream.py b/model/stream.py
@@ -33,17 +33,13 @@ class StreamRec(object):
stream = transformers.TaxiExcludeEmptyTrips(stream)
stream = transformers.taxi_add_datetime(stream)
-
stream = transformers.Select(stream, tuple(v for v in req_vars if not v.endswith('_mask')))
stream = transformers.balanced_batch(stream, key='latitude',
batch_size=self.config.batch_size,
batch_sort_size=self.config.batch_sort_size)
-
stream = Padding(stream, mask_sources=['latitude', 'longitude'])
-
stream = transformers.Select(stream, req_vars)
-
stream = MultiProcessing(stream)
return stream
@@ -54,9 +50,13 @@ class StreamRec(object):
stream = transformers.taxi_add_datetime(stream)
stream = transformers.Select(stream, tuple(v for v in req_vars if not v.endswith('_mask')))
- stream = Batch(stream, iteration_scheme=ConstantScheme(self.config.batch_size))
+ stream = transformers.balanced_batch(stream, key='latitude',
+ batch_size=self.config.batch_size,
+ batch_sort_size=self.config.batch_sort_size)
stream = Padding(stream, mask_sources=['latitude', 'longitude'])
stream = transformers.Select(stream, req_vars)
+ stream = MultiProcessing(stream)
+
return stream
def test(self, req_vars):