commit 80d3ea67a845484d119cb88f0a0412f981ab344c
parent f9a31bd246e3c4736d3f532b566b7437eba6b4de
Author: Alex Auvolat <alex.auvolat@ens.fr>
Date: Mon, 4 May 2015 16:43:48 -0400
New data analysis tool: clustering of arrival points.
Diffstat:
4 files changed, 40 insertions(+), 13 deletions(-)
diff --git a/config/model_0.py b/config/model_0.py
@@ -12,6 +12,6 @@ dim_input = n_begin_end_pts * 2 * 2 + dim_embed + dim_embed
dim_hidden = [200, 100]
dim_output = 2
-learning_rate = 0.002
-momentum = 0.9
+learning_rate = 0.0001
+momentum = 0.99
batch_size = 32
diff --git a/data_analysis/cluster_arrival.py b/data_analysis/cluster_arrival.py
@@ -0,0 +1,27 @@
+import matplotlib.pyplot as plt
+import numpy
+import cPickle
+import scipy.misc
+
+from sklearn.cluster import MeanShift, estimate_bandwidth
+from sklearn.datasets.samples_generator import make_blobs
+from itertools import cycle
+
+print "Reading arrival point list"
+with open("arrivals.pkl") as f:
+ pts = cPickle.load(f)
+
+print "Doing clustering"
+bw = estimate_bandwidth(pts, quantile=.1, n_samples=1000)
+print bw
+bw = 0.001
+
+ms = MeanShift(bandwidth=bw, bin_seeding=True, min_bin_freq=5)
+ms.fit(pts)
+cluster_centers = ms.cluster_centers_
+
+print "Clusters shape: ", cluster_centers.shape
+
+with open("arrival-cluters.pkl", "w") as f:
+ cPickle.dump(cluster_centers, f, protocol=cPickle.HIGHEST_PROTOCOL)
+
diff --git a/data_analysis/destmaps.py b/data_analysis/destmaps.py
@@ -4,7 +4,7 @@ import cPickle
import scipy.misc
print "Loading data..."
-with open("train_normal.pkl") as f: normal = cPickle.load(f)
+with open("train.pkl") as f: normal = cPickle.load(f)
print "Extracting x and y"
# xes = [c[0] for l in normal for c in l[-1]]
@@ -12,21 +12,21 @@ print "Extracting x and y"
xes = [l[-1][-1][0] for l in normal if len(l[-1]) > 0]
yes = [l[-1][-1][1] for l in normal if len(l[-1]) > 0]
-xrg = [-8.75, -8.55]
-yrg = [41.05, 41.25]
+xrg = [-8.80, -8.50]
+yrg = [41.00, 41.30]
-print "Doing 1d x histogram"
-plt.clf(); plt.hist(xes, bins=1000, range=xrg); plt.savefig("xhist_dest.pdf")
-print "Doing 1d y histogram"
-plt.clf(); plt.hist(yes, bins=1000, range=yrg); plt.savefig("yhist_dest.pdf")
+#print "Doing 1d x histogram"
+#plt.clf(); plt.hist(xes, bins=2000, range=xrg); plt.savefig("xhist_dest.pdf")
+#print "Doing 1d y histogram"
+#plt.clf(); plt.hist(yes, bins=2000, range=yrg); plt.savefig("yhist_dest.pdf")
print "Doing 2d histogram"
-hist, xx, yy = numpy.histogram2d(xes, yes, bins=2000, range=[xrg, yrg])
+hist, xx, yy = numpy.histogram2d(xes, yes, bins=4000, range=[xrg, yrg])
# import ipdb; ipdb.set_trace()
print "Imshow"
-plt.clf(); plt.imshow(numpy.log(hist)); plt.savefig("xyhmap_dest.png", dpi=600)
+plt.clf(); plt.imshow(numpy.log(hist)); plt.savefig("xyhmap_dest_x.png", dpi=600)
print "Imsave"
-scipy.misc.imsave("xymap_dest_2.png", numpy.log(hist + 1))
+scipy.misc.imsave("xymap_dest_2_x.png", numpy.log(hist + 1))
diff --git a/model.py b/model.py
@@ -53,7 +53,7 @@ def setup_train_stream():
load_in_memory=True)
train = DataStream(train, iteration_scheme=SequentialExampleScheme(data.dataset_size - config.n_valid))
train = transformers.filter_out_trips(data.valid_trips, train)
- train = transformers.TaxiGenerateSplits(train)
+ train = transformers.TaxiGenerateSplits(train, max_splits=100)
train = transformers.add_first_k(config.n_begin_end_pts, train)
train = transformers.add_last_k(config.n_begin_end_pts, train)
train = transformers.Select(train, ('origin_stand', 'origin_call', 'first_k_latitude',