commit c195fd437b76d00ee780cef49903266165f001a7
parent d58b121de641c0122652bc3d6096a9d0e1048391
Author: Alex Auvolat <alex.auvolat@ens.fr>
Date: Tue, 28 Apr 2015 16:41:46 -0400
Support polylines with <5 points
Diffstat:
4 files changed, 38 insertions(+), 18 deletions(-)
diff --git a/data.py b/data.py
@@ -15,6 +15,12 @@ else:
client_ids = {int(x): y+1 for y, x in enumerate(open(DATA_PATH+"/client_ids.txt"))}
+def get_client_id(n):
+ # Translate a raw client id into the index stored in client_ids
+ # (built just above from client_ids.txt, numbering its entries from 1).
+ # An id that is not in the table yields 0, the same value the
+ # origin_call column parser uses for '' and 'NA', instead of a KeyError.
+ if n in client_ids:
+ return client_ids[n]
+ else:
+ return 0
+
porto_center = numpy.array([[ -8.61612, 41.1573]], dtype=theano.config.floatX)
data_std = numpy.sqrt(numpy.array([[ 0.00333233, 0.00549598]], dtype=theano.config.floatX))
@@ -127,7 +133,7 @@ class TaxiData(Dataset):
taxi_columns = [
("trip_id", lambda x: x),
("call_type", CallType.from_data),
- ("origin_call", lambda x: 0 if x == '' or x == 'NA' else client_ids[int(x)]),
+ ("origin_call", lambda x: 0 if x == '' or x == 'NA' else get_client_id(int(x))),
("origin_stand", lambda x: 0 if x == '' or x == 'NA' else int(x)),
("taxi_id", int),
("timestamp", int),
@@ -144,13 +150,11 @@ taxi_columns_valid = taxi_columns + [
train_files=["%s/split/train-%02d.csv" % (DATA_PATH, i) for i in range(100)]
valid_files=["%s/split/valid.csv" % (DATA_PATH,)]
+test_file="%s/test.csv" % (DATA_PATH,)
train_data=TaxiData(train_files, taxi_columns)
-
valid_data = TaxiData(valid_files, taxi_columns_valid)
-
-# for the moment - will be changed later
-test_data = valid_data
+test_data = TaxiData(test_file, taxi_columns, has_header=True)
def train_it():
return DataIterator(DataStream(train_data))
diff --git a/make_valid.py b/make_valid.py
@@ -14,8 +14,8 @@ with open("valid-full.csv") as f:
def make_valid_item(l):
polyline = ast.literal_eval(l[-1])
last = polyline[-1]
- cut_idx = random.randrange(len(polyline)-5)
- cut = polyline[:cut_idx+6]
+ cut_idx = random.randrange(len(polyline)+1)
+ cut = polyline[:cut_idx]
return l[:-1] + [
cut.__str__(),
last[0],
@@ -23,7 +23,7 @@ def make_valid_item(l):
15 * (len(polyline)-1),
]
-vlines = map(make_valid_item, filter(lambda l: (len(ast.literal_eval(l[-1])) > 5), vlines))
+vlines = map(make_valid_item, filter(lambda l: (len(ast.literal_eval(l[-1])) > 0), vlines))
with open("valid.csv", "w") as f:
wr = csv.writer(f)
@@ -32,5 +32,6 @@ with open("valid.csv", "w") as f:
with open("valid-solution.csv", "w") as f:
wr = csv.writer(f)
+ wr.writerow(["TRIP_ID", "LATITUDE", "LONGITUDE"])
for r in vlines:
wr.writerow([r[0], r[-2], r[-3]])
diff --git a/model.py b/model.py
@@ -140,8 +140,8 @@ def main():
extensions=[DataStreamMonitoring([cost, hcost], valid_stream,
prefix='valid',
- every_n_batches=1),
- Printing(every_n_batches=1),
+ every_n_batches=1000),
+ Printing(every_n_batches=1000),
# Dump('taxi_model', every_n_batches=100),
# LoadFromDump('taxi_model'),
]
@@ -163,6 +163,7 @@ def main():
outfile = open("test-output.csv", "w")
outcsv = csv.writer(outfile)
+ outcsv.writerow(["TRIP_ID", "LATITUDE", "LONGITUDE"])
for out in apply_model.Apply(outputs=outputs, stream=test_stream, return_vars=['trip_id', 'outputs']):
dest = out['outputs']
for i, trip in enumerate(out['trip_id']):
diff --git a/transformers.py b/transformers.py
@@ -3,6 +3,17 @@ import numpy
import theano
import random
+def at_least_k(k, pl, pad_at_begin):
+ # Return a polyline holding at least k points, padding pl when it is shorter.
+ #   k            -- minimum number of points required
+ #   pl           -- sequence of points (assumed to be a Python list, since
+ #                   list concatenation is used below -- TODO confirm)
+ #   pad_at_begin -- truthy: repeat the first point in front of pl;
+ #                   falsy: repeat the last point after pl
+ # A polyline that already has k or more points is returned unchanged.
+ if len(pl) == 0:
+ # Nothing to repeat in an empty polyline, so start from one default
+ # point (same coordinates as porto_center in data.py).
+ pl = [[ -8.61612, 41.1573]]
+ if len(pl) < k:
+ if pad_at_begin:
+ pl = [pl[0]] * (k - len(pl)) + pl
+ else:
+ pl = pl + [pl[-1]] * (k - len(pl))
+ return pl
+
+
class Select(Transformer):
def __init__(self, data_stream, sources):
super(Select, self).__init__(data_stream)
@@ -18,31 +29,34 @@ class Select(Transformer):
 def add_first_k(k, stream):
+ # Extend stream with a 'first_k' entry: the first k points of each
+ # example's polyline, flattened into one floatX vector.
+ # Polylines shorter than k are no longer filtered out; at_least_k pads
+ # them at the end (pad_at_begin=False) by repeating their last point.
 id_polyline=stream.sources.index('polyline')
 def first_k(x):
- return (numpy.array(x[id_polyline][:k], dtype=theano.config.floatX).flatten(),)
- stream = Filter(stream, lambda x: len(x[id_polyline])>=k)
+ pl = at_least_k(k, x[id_polyline], False)
+ return (numpy.array(pl[:k], dtype=theano.config.floatX).flatten(),)
 stream = Mapping(stream, first_k, ('first_k',))
 return stream
 def add_random_k(k, stream):
+ # Extend stream with k consecutive polyline points taken from a random
+ # offset, flattened into one floatX vector.
+ # Polylines shorter than k are padded at the beginning (repeating the
+ # first point) instead of being filtered out, so randrange always has
+ # at least one valid offset.
+ # NOTE(review): the result is published under the name 'last_k', not
+ # 'random_k' -- confirm this is intended.
 id_polyline=stream.sources.index('polyline')
 def random_k(x):
- loc = random.randrange(len(x[id_polyline])-k+1)
- return (numpy.array(x[id_polyline][loc:loc+k], dtype=theano.config.floatX).flatten(),)
- stream = Filter(stream, lambda x: len(x[id_polyline])>=k)
+ pl = at_least_k(k, x[id_polyline], True)
+ loc = random.randrange(len(pl)-k+1)
+ return (numpy.array(pl[loc:loc+k], dtype=theano.config.floatX).flatten(),)
 stream = Mapping(stream, random_k, ('last_k',))
 return stream
 def add_last_k(k, stream):
+ # Extend stream with a 'last_k' entry: the last k points of each
+ # example's polyline, flattened into one floatX vector.
+ # Polylines shorter than k are no longer filtered out; at_least_k pads
+ # them at the beginning (pad_at_begin=True) by repeating their first point.
 id_polyline=stream.sources.index('polyline')
 def last_k(x):
- return (numpy.array(x[id_polyline][-k:], dtype=theano.config.floatX).flatten(),)
- stream = Filter(stream, lambda x: len(x[id_polyline])>=k)
+ pl = at_least_k(k, x[id_polyline], True)
+ return (numpy.array(pl[-k:], dtype=theano.config.floatX).flatten(),)
 stream = Mapping(stream, last_k, ('last_k',))
 return stream
 def add_destination(stream):
+ # Extend stream with a 'destination' entry: the last point of each
+ # example's polyline as a floatX array.
+ # Going through at_least_k(1, ...) means an empty polyline yields
+ # at_least_k's default point instead of raising IndexError on [-1].
 id_polyline=stream.sources.index('polyline')
- return Mapping(stream, lambda x: (numpy.array(x[id_polyline][-1], dtype=theano.config.floatX),), ('destination',))
+ return Mapping(stream,
+ lambda x:
+ (numpy.array(at_least_k(1, x[id_polyline], True)[-1], dtype=theano.config.floatX),),
+ ('destination',))
def concat_destination_xy(stream):
id_dx=stream.sources.index('destination_x')