taxi

Winning entry to the Kaggle taxi competition
git clone https://esimon.eu/repos/taxi.git
Log | Files | Refs | README

commit 6a0b47a2fc7c4e800f14212ae81dbd56de17fa94
parent 676af1086b141a7803626b040e7da03526b95406
Author: AdeB <adbrebs@gmail.com>
Date:   Sat, 25 Apr 2015 10:09:01 -0400

Data analysis updated for the new Dataset class. Coordinates are saved in a light numpy array for fast/light retrieval.

Diffstat:
M .gitignore | 2 ++
D alex/plots.py | 29 -----------------------------
A data_analysis/maps.py | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
A data_analysis/maps_old.py | 29 +++++++++++++++++++++++++++++
4 files changed, 86 insertions(+), 29 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -1,3 +1,5 @@ +.idea/* + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/alex/plots.py b/alex/plots.py @@ -1,29 +0,0 @@ -import matplotlib.pyplot as plt -import numpy -import cPickle -import scipy - -print "Loading data..." -with open("train_normal.pkl") as f: normal = cPickle.load(f) - -print "Extracting x and y" -xes = [c[0] for l in normal for c in l[-1]] -yes = [c[1] for l in normal for c in l[-1]] - -xrg = [-8.75, -8.55] -yrg = [41.05, 41.25] - -print "Doing 1d histogram" -#plt.clf(); plt.hist(xes, bins=1000, range=xrg); plt.savefig("xhist.pdf") -#plt.clf(); plt.hist(yes, bins=1000, range=yrg); plt.savefig("yhist.pdf") - -print "Doing 2d histogram" -#plt.clf(); plt.hist2d(xes, yes, bins=500, range=[xrg, yrg]); plt.savefig("xymap.pdf") - -hist, xx, yy = numpy.histogram2d(xes, yes, bins=2000, range=[xrg, yrg]) - -import ipdb; ipdb.set_trace() - -plt.clf(); plt.imshow(numpy.log(hist)); plt.savefig("xyhmap.pdf") - -scipy.misc.imsave("xymap.png", numpy.log(hist)) diff --git a/data_analysis/maps.py b/data_analysis/maps.py @@ -0,0 +1,55 @@ +import cPickle +import scipy +import numpy as np +import matplotlib.pyplot as plt + +import data + + +def compute_number_coordinates(): + train_it = data.train_it() + + # Count the number of coordinates + n_coordinates = 0 + for ride in train_it: + n_coordinates += len(ride[-1]) + print n_coordinates + + return n_coordinates + + +def extract_coordinates(n_coordinates=None): + """Extract coordinates from the dataset and store them in a numpy array""" + + if n_coordinates is None: + n_coordinates = compute_number_coordinates() + + coordinates = np.zeros((n_coordinates, 2), dtype="float32") + train_it = data.train_it() + + c = 0 + for ride in train_it: + for point in ride[-1]: + coordinates[c] = point + c += 1 + + cPickle.dump(coordinates, open(data.DATA_PATH + "/coordinates_array.pkl", "wb")) + + +def draw_map(coordinates, xrg, yrg): + + hist, xx, yy = 
np.histogram2d(coordinates[:, 0], coordinates[:, 1], bins=2000, range=[xrg, yrg]) + + plt.imshow(np.log(hist)) + plt.savefig(data.DATA_PATH + "/analysis/xyhmap.pdf") + + scipy.misc.imsave(data.DATA_PATH + "/analysis/xymap.png", np.log(hist)) + + +if __name__ == "__main__": + # extract_coordinates(n_coordinates=83360928) + + coordinates = cPickle.load(open(data.DATA_PATH + "/coordinates_array.pkl", "rb")) + xrg = [-8.75, -8.55] + yrg = [41.05, 41.25] + draw_map(coordinates, xrg, yrg) diff --git a/data_analysis/maps_old.py b/data_analysis/maps_old.py @@ -0,0 +1,29 @@ +import matplotlib.pyplot as plt +import numpy +import cPickle +import scipy + +print "Loading data..." +with open("../train_normal.pkl") as f: normal = cPickle.load(f) + +print "Extracting x and y" +xes = [c[0] for l in normal for c in l[-1]] +yes = [c[1] for l in normal for c in l[-1]] + +xrg = [-8.75, -8.55] +yrg = [41.05, 41.25] + +print "Doing 1d histogram" +#plt.clf(); plt.hist(xes, bins=1000, range=xrg); plt.savefig("xhist.pdf") +#plt.clf(); plt.hist(yes, bins=1000, range=yrg); plt.savefig("yhist.pdf") + +print "Doing 2d histogram" +#plt.clf(); plt.hist2d(xes, yes, bins=500, range=[xrg, yrg]); plt.savefig("xymap.pdf") + +hist, xx, yy = numpy.histogram2d(xes, yes, bins=2000, range=[xrg, yrg]) + +import ipdb; ipdb.set_trace() + +plt.clf(); plt.imshow(numpy.log(hist)); plt.savefig("xyhmap.pdf") + +scipy.misc.imsave("xymap.png", numpy.log(hist))