commit ab1076e00d6a92120e46d4a0085911b4425a0d60
parent 5b496677ea1db59a6718e5c9b2958177c76cb25f
Author: Alex Auvolat <alex.auvolat@ens.fr>
Date: Tue, 5 May 2015 11:36:59 -0400
Add date/time transformer and new model that uses it
Diffstat:
7 files changed, 95 insertions(+), 94 deletions(-)
diff --git a/config/simple_mlp_2_cs.py b/config/simple_mlp_2_cs.py
@@ -2,10 +2,6 @@ import model.simple_mlp as model
import data
-n_dow = 7 # number of division for dayofweek/dayofmonth/hourofday
-n_dom = 31
-n_hour = 24
-
n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory
n_end_pts = 5
diff --git a/config/simple_mlp_2_noembed.py b/config/simple_mlp_2_noembed.py
@@ -2,10 +2,6 @@ import model.simple_mlp as model
import data
-n_dow = 7 # number of division for dayofweek/dayofmonth/hourofday
-n_dom = 31
-n_hour = 24
-
n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory
n_end_pts = 5
diff --git a/config/simple_mlp_tgtcls_0_cs.py b/config/simple_mlp_tgtcls_0_cs.py
@@ -4,10 +4,6 @@ import data
import model.simple_mlp_tgtcls as model
-n_dow = 7 # number of division for dayofweek/dayofmonth/hourofday
-n_dom = 31
-n_hour = 24
-
n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory
n_end_pts = 5
diff --git a/config/simple_mlp_tgtcls_1_cs.py b/config/simple_mlp_tgtcls_1_cs.py
@@ -4,10 +4,6 @@ import data
import model.simple_mlp_tgtcls as model
-n_dow = 7 # number of division for dayofweek/dayofmonth/hourofday
-n_dom = 31
-n_hour = 24
-
n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory
n_end_pts = 5
diff --git a/config/simple_mlp_tgtcls_1_cswdt.py b/config/simple_mlp_tgtcls_1_cswdt.py
@@ -0,0 +1,28 @@
+import cPickle
+
+import data
+
+import model.simple_mlp_tgtcls as model
+
+n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory
+n_end_pts = 5
+
+n_valid = 1000
+
+with open(data.DATA_PATH + "/arrival-clusters.pkl") as f: tgtcls = cPickle.load(f)
+
+dim_embeddings = [
+ ('origin_call', data.n_train_clients+1, 10),
+ ('origin_stand', data.n_stands+1, 10),
+ ('week_of_year', 53, 10),
+ ('day_of_week', 7, 10),
+ ('qhour_of_day', 24 * 4, 10)
+]
+
+dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)
+dim_hidden = [500]
+dim_output = tgtcls.shape[0]
+
+learning_rate = 0.0001
+momentum = 0.99
+batch_size = 32
diff --git a/train.py b/train.py
@@ -49,11 +49,13 @@ def setup_train_stream(req_vars):
subset=slice(0, data.dataset_size),
load_in_memory=True)
train = DataStream(train, iteration_scheme=SequentialExampleScheme(data.dataset_size - config.n_valid))
- train = transformers.filter_out_trips(data.valid_trips, train)
+
+ train = transformers.TaxiExcludeTrips(data.valid_trips, train)
train = transformers.TaxiGenerateSplits(train, max_splits=100)
- train = transformers.add_first_k(config.n_begin_end_pts, train)
- train = transformers.add_last_k(config.n_begin_end_pts, train)
+ train = transformers.TaxiAddDateTime(train)
+ train = transformers.TaxiAddFirstK(config.n_begin_end_pts, train)
+ train = transformers.TaxiAddLastK(config.n_begin_end_pts, train)
train = transformers.Select(train, tuple(req_vars))
train_stream = Batch(train, iteration_scheme=ConstantScheme(config.batch_size))
@@ -63,8 +65,9 @@ def setup_train_stream(req_vars):
def setup_valid_stream(req_vars):
valid = DataStream(data.valid_data)
- valid = transformers.add_first_k(config.n_begin_end_pts, valid)
- valid = transformers.add_last_k(config.n_begin_end_pts, valid)
+ valid = transformers.TaxiAddDateTime(valid)
+ valid = transformers.TaxiAddFirstK(config.n_begin_end_pts, valid)
+ valid = transformers.TaxiAddLastK(config.n_begin_end_pts, valid)
valid = transformers.Select(valid, tuple(req_vars))
valid_stream = Batch(valid, iteration_scheme=ConstantScheme(1000))
@@ -74,8 +77,9 @@ def setup_valid_stream(req_vars):
def setup_test_stream(req_vars):
test = DataStream(data.test_data)
- test = transformers.add_first_k(config.n_begin_end_pts, test)
- test = transformers.add_last_k(config.n_begin_end_pts, test)
+ test = transformers.TaxiAddDateTime(test)
+ test = transformers.TaxiAddFirstK(config.n_begin_end_pts, test)
+ test = transformers.TaxiAddLastK(config.n_begin_end_pts, test)
test = transformers.Select(test, tuple(req_vars))
test_stream = Batch(test, iteration_scheme=ConstantScheme(1000))
diff --git a/transformers.py b/transformers.py
@@ -4,6 +4,8 @@ import theano
import random
import data
+import datetime
+
def at_least_k(k, v, pad_at_begin, is_longitude):
if len(v) == 0:
v = numpy.array([data.porto_center[1 if is_longitude else 0]], dtype=theano.config.floatX)
@@ -63,78 +65,61 @@ class TaxiGenerateSplits(Transformer):
return tuple(r + [dlat, dlon])
-
-class first_k(object):
- def __init__(self, k, id_latitude, id_longitude):
- self.k = k
- self.id_latitude = id_latitude
- self.id_longitude = id_longitude
- def __call__(self, data):
- return (numpy.array(at_least_k(self.k, data[self.id_latitude], False, False)[:self.k],
- dtype=theano.config.floatX),
- numpy.array(at_least_k(self.k, data[self.id_longitude], False, True)[:self.k],
- dtype=theano.config.floatX))
-def add_first_k(k, stream):
- id_latitude = stream.sources.index('latitude')
- id_longitude = stream.sources.index('longitude')
- return Mapping(stream, first_k(k, id_latitude, id_longitude), ('first_k_latitude', 'first_k_longitude'))
-
-class random_k(object):
- def __init__(self, k, id_latitude, id_longitude):
+class TaxiAddFirstK(Transformer):
+ def __init__(self, k, stream):
+ super(TaxiAddFirstK, self).__init__(stream)
+ self.sources = stream.sources + ('first_k_latitude', 'first_k_longitude')
+ self.id_latitude = stream.sources.index('latitude')
+ self.id_longitude = stream.sources.index('longitude')
self.k = k
- self.id_latitude = id_latitude
- self.id_longitude = id_longitude
- def __call__(self, x):
- lat = at_least_k(self.k, x[self.id_latitude], True, False)
- lon = at_least_k(self.k, x[self.id_longitude], True, True)
- loc = random.randrange(len(lat)-self.k+1)
- return (numpy.array(lat[loc:loc+self.k], dtype=theano.config.floatX),
- numpy.array(lon[loc:loc+self.k], dtype=theano.config.floatX))
-def add_random_k(k, stream):
- id_latitude = stream.sources.index('latitude')
- id_longitude = stream.sources.index('longitude')
- return Mapping(stream, random_k(k, id_latitude, id_longitude), ('last_k_latitude', 'last_k_longitude'))
-
-class last_k(object):
- def __init__(self, k, id_latitude, id_longitude):
+ def get_data(self, request=None):
+ if request is not None: raise ValueError
+ data = next(self.child_epoch_iterator)
+ first_k = (numpy.array(at_least_k(self.k, data[self.id_latitude], False, False)[:self.k],
+ dtype=theano.config.floatX),
+ numpy.array(at_least_k(self.k, data[self.id_longitude], False, True)[:self.k],
+ dtype=theano.config.floatX))
+ return data + first_k
+
+class TaxiAddLastK(Transformer):
+ def __init__(self, k, stream):
+ super(TaxiAddLastK, self).__init__(stream)
+ self.sources = stream.sources + ('last_k_latitude', 'last_k_longitude')
+ self.id_latitude = stream.sources.index('latitude')
+ self.id_longitude = stream.sources.index('longitude')
self.k = k
- self.id_latitude = id_latitude
- self.id_longitude = id_longitude
- def __call__(self, data):
- return (numpy.array(at_least_k(self.k, data[self.id_latitude], True, False)[-self.k:],
- dtype=theano.config.floatX),
- numpy.array(at_least_k(self.k, data[self.id_longitude], True, True)[-self.k:],
- dtype=theano.config.floatX))
-def add_last_k(k, stream):
- id_latitude = stream.sources.index('latitude')
- id_longitude = stream.sources.index('longitude')
- return Mapping(stream, last_k(k, id_latitude, id_longitude), ('last_k_latitude', 'last_k_longitude'))
-
-class destination(object):
- def __init__(self, id_latitude, id_longitude):
- self.id_latitude = id_latitude
- self.id_longitude = id_longitude
- def __call__(self, data):
- return (numpy.array(at_least_k(1, data[self.id_latitude], True, False)[-1],
+ def get_data(self, request=None):
+ if request is not None: raise ValueError
+ data = next(self.child_epoch_iterator)
+ last_k = (numpy.array(at_least_k(self.k, data[self.id_latitude], True, False)[-self.k:],
dtype=theano.config.floatX),
- numpy.array(at_least_k(1, data[self.id_longitude], True, True)[-1],
- dtype=theano.config.floatX))
-def add_destination(stream):
- id_latitude = stream.sources.index('latitude')
- id_longitude = stream.sources.index('longitude')
- return Mapping(stream, destination(id_latitude, id_longitude), ('destination_latitude', 'destination_longitude'))
-
-
-class trip_filter(object):
- def __init__(self, id_trip_id, exclude):
- self.id_trip_id = id_trip_id
- self.exclude = exclude
- def __call__(self, data):
- if data[self.id_trip_id] in self.exclude:
- return False
- else:
- return True
-def filter_out_trips(exclude_trips, stream):
- id_trip_id = stream.sources.index('trip_id')
- return Filter(stream, trip_filter(id_trip_id, exclude_trips))
+ numpy.array(at_least_k(self.k, data[self.id_longitude], True, True)[-self.k:],
+ dtype=theano.config.floatX))
+ return data + last_k
+
+class TaxiAddDateTime(Transformer):
+ def __init__(self, stream):
+ super(TaxiAddDateTime, self).__init__(stream)
+ self.sources = stream.sources + ('week_of_year', 'day_of_week', 'qhour_of_day')
+ self.id_timestamp = stream.sources.index('timestamp')
+ def get_data(self, request=None):
+ if request is not None: raise ValueError
+ data = next(self.child_epoch_iterator)
+ ts = data[self.id_timestamp]
+ date = datetime.datetime.utcfromtimestamp(ts)
+ info = (date.isocalendar()[1] - 1, date.weekday(), date.hour * 4 + date.minute / 15)
+ return data + info
+
+class TaxiExcludeTrips(Transformer):
+ def __init__(self, exclude_list, stream):
+ super(TaxiExcludeTrips, self).__init__(stream)
+ self.id_trip_id = stream.sources.index('trip_id')
+ self.exclude = {v: True for v in exclude_list}
+ def get_data(self, request=None):
+ if request is not None: raise ValueError
+ while True:
+ data = next(self.child_epoch_iterator)
+ if not data[self.id_trip_id] in self.exclude: break
+ return data
+