p-value caluculation script - taxi - Winning entry to the Kaggle taxi competition

commit defab74395f2ddb2641bba6ab8d18bdedde7a334
parent ca40e5c81d385e1422cebe40e009d7e93b95bfbb
Author: Alex Auvolat <alex.auvolat@ens.fr>
Date:   Wed, 29 Jul 2015 12:06:00 -0400

p-value caluculation script

Diffstat:
A data/make_reference_output.py  | 28 ++++++++++++++++++++++++++++
M data/transformers.py  | 2 +-
A pvalue.py  | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

3 files changed, 89 insertions(+), 1 deletion(-)
diff --git a/data/make_reference_output.py b/data/make_reference_output.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+
+import csv
+import os
+
+from fuel.iterator import DataIterator
+from fuel.schemes import SequentialExampleScheme
+from fuel.streams import DataStream
+
+from data.hdf5 import TaxiDataset
+import data
+
+dest_outfile = open(os.path.join(data.path, 'test_answer.csv'), 'w')
+dest_outcsv = csv.writer(dest_outfile)
+dest_outcsv.writerow(["TRIP_ID", "LATITUDE", "LONGITUDE"])
+
+dataset = TaxiDataset('test', 'tvt.hdf5',
+                     sources=('trip_id', 'longitude', 'latitude',
+                              'destination_longitude', 'destination_latitude'))
+it = DataIterator(DataStream(dataset), iter(xrange(dataset.num_examples)), as_dict=True)
+
+for v in it:
+    # print v
+    dest_outcsv.writerow([v['trip_id'], v['destination_latitude'],
+                                        v['destination_longitude']])
+
+dest_outfile.close()
+
diff --git a/data/transformers.py b/data/transformers.py
@@ -187,7 +187,7 @@ class _window_helper(object):
 
         if x.shape[0] < self.window_len:
             x = numpy.concatenate(
-                [x, numpy.full((self.window_len - x.shape[0],), x[-1])])
+                [numpy.full((self.window_len - x.shape[0],), x[0]), x])
             
         y = [x[i: i+x.shape[0]-self.window_len+1][:, None]
              for i in range(self.window_len)]
diff --git a/pvalue.py b/pvalue.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+
+import os
+import sys
+
+import math
+import numpy
+
+import data
+
+# Haversine distance calculation
+# --------- -------- -----------
+
+rearth = 6371.
+deg2rad = 3.141592653589793 / 180.
+
+def hdist(a, b):
+    lat1 = a[:, 0] * deg2rad
+    lon1 = a[:, 1] * deg2rad
+    lat2 = b[:, 0] * deg2rad
+    lon2 = b[:, 1] * deg2rad
+
+    dlat = abs(lat1-lat2)
+    dlon = abs(lon1-lon2)
+
+    al = numpy.sin(dlat/2)**2  + numpy.cos(lat1) * numpy.cos(lat2) * (numpy.sin(dlon/2)**2)
+    d = numpy.arctan2(numpy.sqrt(al), numpy.sqrt(1.-al))
+
+    hd = 2. * rearth * d
+
+    return hd
+
+
+# Read the inputs
+# ---- --- ------
+
+def readcsv(f):
+    return numpy.genfromtxt(f, delimiter=',', skip_header=1)[:, 1:3]
+
+answer = readcsv(os.path.join(data.path, 'test_answer.csv'))
+
+tables = [readcsv(f) for f in sys.argv if '.csv' in f]
+etables = [hdist(t, answer) for t in tables]
+
+# Calculate p-values
+# --------- --------
+
+pvalue = numpy.zeros((len(tables), len(tables)))
+
+for i, a in enumerate(etables):
+    for j, b in enumerate(etables):
+        if i == j:
+            continue
+        d = b - a
+        var = (numpy.mean((a - numpy.mean(a))**2)
+                + numpy.mean((b - numpy.mean(b))**2)) / 2.
+        pv = 1 - .5 * (1 + math.erf(numpy.mean(d) / numpy.sqrt(2 * var)))
+        pvalue[i, j] = pv
+
+print pvalue

	taxi Winning entry to the Kaggle taxi competition
	git clone https://esimon.eu/repos/taxi.git
	Log \| Files \| Refs \| README

A	data/make_reference_output.py	\|	28	++++++++++++++++++++++++++++
M	data/transformers.py	\|	2	+-
A	pvalue.py	\|	60	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++