prepare.sh (4214B)
#!/usr/bin/env bash
#
# prepare.sh - prepare the Kaggle taxi-trajectory data for training.
#
# Usage:
#   TAXI_PATH=/path/to/kaggle/data ./prepare.sh
#
# Must be run from the root of the repository (the directory holding
# train.py). Steps, in order:
#   1. check that the required python2 modules can be imported;
#   2. verify the md5 of the three downloaded archives in $TAXI_PATH;
#   3. extract them and patch a malformed row in the taxi stand metadata;
#   4. run the HDF5 conversion, validation set and destination cluster
#      python scripts;
#   5. create the model_data/ and output/ folders.
#
# Environment:
#   TAXI_PATH   directory containing train.csv.zip, test.csv.zip and
#               metaData_taxistandsID_name_GPSlocation.csv.zip
#   PYTHONPATH  the current directory is prepended to it
#
# Exits with status 1 as soon as any step fails.

# Terminal escape sequences. When tput cannot produce them (e.g. TERM is
# unset) the variables simply stay empty and the output is uncoloured.
RESET=$(tput sgr0 2>/dev/null)
BOLD=$(tput bold 2>/dev/null)
RED="$RESET$(tput setaf 1 2>/dev/null)$BOLD"
GREEN="$RESET$(tput setaf 2 2>/dev/null)"
YELLOW="$RESET$(tput setaf 3 2>/dev/null)"
BLUE="$RESET$(tput setaf 4 2>/dev/null)$BOLD"

# Most messages end while a colour is still active; reset the attributes on
# every exit path so the caller's terminal is not left coloured.
trap 'printf "%s" "$RESET"' EXIT

# Append to PYTHONPATH only when it is non-empty, to avoid a trailing ':'.
export PYTHONPATH="$PWD${PYTHONPATH:+:$PYTHONPATH}"

#######################################
# Print an error message in red on stderr and abort the script.
# Arguments: $* - message
# Returns:   never; exits with status 1
#######################################
die() {
  printf '%s\n' "${RED}$*${RESET}" >&2
  exit 1
}

#######################################
# Print a section heading preceded by a blank line.
# Arguments: $1 - heading text
#######################################
section() {
  printf '\n%s\n' "$BLUE# $1"
}

#######################################
# Run a python2 script and abort the whole script if it fails.
# Arguments: $@ - script path followed by its arguments
#######################################
run_python() {
  if ! python2 "$@"; then
    die "failed: python2 $*"
  fi
}

#######################################
# Check that a python2 module can be imported and print its version.
# Arguments: $1 - module name
#            $2 - version the authors used (informational only)
# Returns:   exits with status 1 if the python2 snippet fails
#######################################
python_import() {
  local module=$1
  local used_version=$2

  printf '%s' "${YELLOW}${module}... $RESET"
  if ! python2 -c "import $module; print '${GREEN}version', $module.__version__, '${YELLOW}(we used version $used_version)'"; then
    die "failed, $module is not installed"
  fi
}

#######################################
# Compare the md5 digest of a file in $TAXI_PATH with an expected value.
# Globals:   TAXI_PATH (read)
# Arguments: $1 - file name relative to $TAXI_PATH
#            $2 - expected md5 digest
# Returns:   0 when the digest matches, or when neither md5 nor md5sum is
#            available (the check is then skipped with a message);
#            exits with status 1 if the file is missing or the digest differs
#######################################
md5_check() {
  local name=$1
  local expected=$2
  local path="$TAXI_PATH/$name"
  local digest

  printf '%s' "${YELLOW}md5sum ${name}... $RESET"
  if [[ ! -e "$path" ]]; then
    die "file not found, are you sure you set the TAXI_PATH variable correctly?"
  fi

  if command -v md5 >/dev/null 2>&1; then
    # Keep what follows the last space of the output (the digest).
    digest=$(md5 "$path" | sed -e 's/^.* //')
  elif command -v md5sum >/dev/null 2>&1; then
    # Keep what precedes the first space of the output (the digest).
    digest=$(md5sum "$path" | sed -e 's/ .*//')
  else
    # Best effort: without a digest tool the check is skipped, not failed.
    echo "${RED} no md5 utility"
    return 0
  fi

  # Quoted on both sides: an empty digest is a plain mismatch, not a
  # syntax error, and the expected value is never treated as a pattern.
  if [[ "$digest" == "$expected" ]]; then
    echo "$GREEN$digest ok"
  else
    die "$digest failed"
  fi
}

#######################################
# Extract an archive from $TAXI_PATH into $TAXI_PATH, overwriting files.
# Globals:   TAXI_PATH (read)
# Arguments: $1 - archive name relative to $TAXI_PATH
# Returns:   exits with status 1 if unzip reports an error
#######################################
zipextract() {
  local archive=$1
  local status

  printf '%s' "${YELLOW}unziping ${archive}... $RESET"
  unzip -o "$TAXI_PATH/$archive" -d "$TAXI_PATH"
  status=$?
  # Info-ZIP unzip uses status 1 for warnings after a completed extraction;
  # only higher values are treated as failures here.
  if (( status > 1 )); then
    die "failed (unzip exited with status $status)"
  fi
  echo "${GREEN}ok"
}

#######################################
# Create an output folder; an already existing folder is not an error.
# Arguments: $1 - folder path
# Returns:   exits with status 1 if the folder cannot be created
#######################################
make_folder() {
  printf '%s' "${YELLOW}mkdir $1... $RESET"
  mkdir -p -- "$1" || die "failed"
  echo "${GREEN}ok"
}

echo "${YELLOW}This script will prepare the data."
echo "${YELLOW}You should run it from inside the repository."
echo "${YELLOW}You should set the TAXI_PATH variable to where the data downloaded from kaggle is."
echo "${YELLOW}Three data files are needed: ${BOLD}train.csv.zip${YELLOW}, ${BOLD}test.csv.zip${YELLOW} and ${BOLD}metaData_taxistandsID_name_GPSlocation.csv.zip${YELLOW}. They can be found at the following url: ${BOLD}https://www.kaggle.com/c/pkdd-15-predict-taxi-service-trajectory-i/data"
if [[ ! -e train.py ]]; then
  die "train.py not found, you are not inside the taxi repository."
fi


section "Checking dependencies"

python_import h5py 2.5.0
python_import theano 0.7.0.dev
python_import fuel 0.0.1-ed725a7ff9f3d080ef882d4ae7e4373c4984f35a
python_import blocks 0.0.1-1e0aca9171611be4df404129d91a991354e67730
python_import sklearn 0.16.1


section "Checking data"

# An empty TAXI_PATH would make every path below resolve under "/".
if [[ -z "${TAXI_PATH:-}" ]]; then
  die "TAXI_PATH is not set, it must point to the folder holding the data downloaded from kaggle."
fi
echo "${YELLOW}TAXI_PATH is set to $TAXI_PATH"

md5_check train.csv.zip 87a1b75adfde321dc163160b495964e8
md5_check test.csv.zip 47133bf7349cb80cc668fa56af8ce743
md5_check metaData_taxistandsID_name_GPSlocation.csv.zip fecec7286191af868ce8fb208f5c7643


section "Extracting data"

zipextract train.csv.zip
md5_check train.csv 68cc499ac4937a3079ebf69e69e73971

zipextract test.csv.zip
md5_check test.csv f2ceffde9d98e3c49046c7d998308e71

zipextract metaData_taxistandsID_name_GPSlocation.csv.zip

# The "41,Nevogilde" row lacks the comma between its two coordinates.
# Write the corrected copy to a temporary file, then replace the original
# only if sed succeeded.
metadata_csv="$TAXI_PATH/metaData_taxistandsID_name_GPSlocation.csv"
printf '%s' "${YELLOW}patching error in metadata csv... $RESET"
if sed -e 's/41,Nevogilde,41.163066654-8.67598304213/41,Nevogilde,41.163066654,-8.67598304213/' \
    "$metadata_csv" > "$metadata_csv.tmp" \
  && mv -- "$metadata_csv.tmp" "$metadata_csv"; then
  echo "${GREEN}ok"
else
  rm -f -- "$metadata_csv.tmp"
  die "failed"
fi

md5_check metaData_taxistandsID_name_GPSlocation.csv 724805b0b1385eb3efc02e8bdfe9c1df


section "Conversion of training set to HDF5"
echo "${YELLOW}This might take some time$RESET"
run_python data/csv_to_hdf5.py "$TAXI_PATH" "$TAXI_PATH/data.hdf5"


section "Generation of validation set"
echo "${YELLOW}This might take some time$RESET"

printf '%s' "${YELLOW}initialization... $RESET"
run_python data/init_valid.py
echo "${GREEN}ok"

printf '%s' "${YELLOW}cutting... $RESET"
run_python data/make_valid_cut.py test_times_0
echo "${GREEN}ok"


section "Generation of destination cluster"
echo "${YELLOW}This might take some time$RESET"
printf '%s' "${YELLOW}generating... $RESET"
run_python data_analysis/cluster_arrival.py
echo "${GREEN}ok"


section "Creating output folders"
make_folder model_data
make_folder output

printf '\n%s\n' "$GREEN${BOLD}The data was successfully prepared"
echo "${YELLOW}To train the winning model on gpu, you can now run the following command:"
echo "${YELLOW}THEANO_FLAGS=floatX=float32,device=gpu,optimizer=fast_run python2 train.py dest_mlp_tgtcls_1_cswdtx_alexandre"