commit defab74395f2ddb2641bba6ab8d18bdedde7a334
parent ca40e5c81d385e1422cebe40e009d7e93b95bfbb
Author: Alex Auvolat <alex.auvolat@ens.fr>
Date: Wed, 29 Jul 2015 12:06:00 -0400
p-value caluculation script
Diffstat:
3 files changed, 89 insertions(+), 1 deletion(-)
diff --git a/data/make_reference_output.py b/data/make_reference_output.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+
+import csv
+import os
+
+from fuel.iterator import DataIterator
+from fuel.schemes import SequentialExampleScheme
+from fuel.streams import DataStream
+
+from data.hdf5 import TaxiDataset
+import data
+
+dest_outfile = open(os.path.join(data.path, 'test_answer.csv'), 'w')
+dest_outcsv = csv.writer(dest_outfile)
+dest_outcsv.writerow(["TRIP_ID", "LATITUDE", "LONGITUDE"])
+
+dataset = TaxiDataset('test', 'tvt.hdf5',
+ sources=('trip_id', 'longitude', 'latitude',
+ 'destination_longitude', 'destination_latitude'))
+it = DataIterator(DataStream(dataset), iter(xrange(dataset.num_examples)), as_dict=True)
+
+for v in it:
+ # print v
+ dest_outcsv.writerow([v['trip_id'], v['destination_latitude'],
+ v['destination_longitude']])
+
+dest_outfile.close()
+
diff --git a/data/transformers.py b/data/transformers.py
@@ -187,7 +187,7 @@ class _window_helper(object):
if x.shape[0] < self.window_len:
x = numpy.concatenate(
- [x, numpy.full((self.window_len - x.shape[0],), x[-1])])
+ [numpy.full((self.window_len - x.shape[0],), x[0]), x])
y = [x[i: i+x.shape[0]-self.window_len+1][:, None]
for i in range(self.window_len)]
diff --git a/pvalue.py b/pvalue.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+
+import os
+import sys
+
+import math
+import numpy
+
+import data
+
+# Haversine distance calculation
+# --------- -------- -----------
+
+rearth = 6371.
+deg2rad = 3.141592653589793 / 180.
+
+def hdist(a, b):
+ lat1 = a[:, 0] * deg2rad
+ lon1 = a[:, 1] * deg2rad
+ lat2 = b[:, 0] * deg2rad
+ lon2 = b[:, 1] * deg2rad
+
+ dlat = abs(lat1-lat2)
+ dlon = abs(lon1-lon2)
+
+ al = numpy.sin(dlat/2)**2 + numpy.cos(lat1) * numpy.cos(lat2) * (numpy.sin(dlon/2)**2)
+ d = numpy.arctan2(numpy.sqrt(al), numpy.sqrt(1.-al))
+
+ hd = 2. * rearth * d
+
+ return hd
+
+
+# Read the inputs
+# ---- --- ------
+
+def readcsv(f):
+ return numpy.genfromtxt(f, delimiter=',', skip_header=1)[:, 1:3]
+
+answer = readcsv(os.path.join(data.path, 'test_answer.csv'))
+
+tables = [readcsv(f) for f in sys.argv if '.csv' in f]
+etables = [hdist(t, answer) for t in tables]
+
+# Calculate p-values
+# --------- --------
+
+pvalue = numpy.zeros((len(tables), len(tables)))
+
+for i, a in enumerate(etables):
+ for j, b in enumerate(etables):
+ if i == j:
+ continue
+ d = b - a
+ var = (numpy.mean((a - numpy.mean(a))**2)
+ + numpy.mean((b - numpy.mean(b))**2)) / 2.
+ pv = 1 - .5 * (1 + math.erf(numpy.mean(d) / numpy.sqrt(2 * var)))
+ pvalue[i, j] = pv
+
+print pvalue