data_set.cpp (3150B)
1 #include <cstddef> 2 #include <fstream> 3 #include <string> 4 #include <vector> 5 6 #include <boost/make_shared.hpp> 7 #include <boost/shared_ptr.hpp> 8 #include <boost/tokenizer.hpp> 9 #include <nmlp/Matrix.h> 10 #include <nmlp/Tensor.h> 11 12 #include "data_set.hpp" 13 14 Data_set::Data_set(boost::shared_ptr<Tensor> views, std::vector<std::string> const &view_kinds): views(views), view_kinds(view_kinds) {} 15 16 std::size_t Data_set::size() const { 17 // nmlp is const-inconsistent. 18 Data_set * const ncthis=const_cast<Data_set*>(this); 19 return ncthis->views->getMatrix(0)->getNumberOfRows(); 20 } 21 22 std::size_t Data_set::number_of_views() const { 23 return view_kinds.size(); 24 } 25 26 std::string Data_set::kind(std::size_t id) const { 27 return view_kinds[id]; 28 } 29 30 boost::shared_ptr<Matrix> Data_set::get(std::size_t element, std::size_t view) const { 31 // nmlp is const-inconsistent. 32 Data_set * const ncthis=const_cast<Data_set*>(this); 33 boost::shared_ptr<Matrix> mat=ncthis->views->getMatrix(view); 34 boost::shared_ptr<Matrix> ret=boost::make_shared<CPUMatrix>(1, mat->getNumberOfColumns()); 35 for(std::size_t x=0; x<mat->getNumberOfColumns(); ++x) 36 ret->setValue(0, x, mat->getValue(element, x)); 37 return ret; 38 } 39 40 boost::shared_ptr<Matrix> Data_set::get(std::size_t view) const { 41 // nmlp is const-inconsistent. 42 Data_set * const ncthis=const_cast<Data_set*>(this); 43 return ncthis->views->getMatrix(view); 44 } 45 46 Data_set classification_svmfile_to_data_set(std::string const &filepath, std::vector<std::string> kinds){ 47 if(kinds.size()!=2) 48 throw std::runtime_error("Classification SVM files always represent 2 views, but the given kinds vector is not of size 2"); 49 50 std::ifstream file(filepath); 51 if(!file) 52 throw std::runtime_error("Can't open SVM file"); 53 54 std::vector<float> output; 55 std::vector<std::vector<float> > input; 56 57 std::string buf; 58 boost::char_separator<char> separators(" \t:"); 59 std::size_t max_feature=1; 60 while(std::getline(file, buf)){ 61 typedef boost::tokenizer<boost::char_separator<char> > tokenizer; 62 tokenizer tokens(buf, separators); 63 tokenizer::iterator token=tokens.begin(); 64 65 output.push_back(boost::lexical_cast<float>(*token++)); 66 input.push_back(std::vector<float>(max_feature)); 67 68 while(token!=tokens.end()){ 69 std::size_t const k=boost::lexical_cast<std::size_t>(*token++); 70 if(token==tokens.end()) 71 break; 72 float const v=boost::lexical_cast<float>(*token++); 73 74 if(k>max_feature){ 75 max_feature=k; 76 for(std::vector<std::vector<float> >::iterator it=input.begin(); it!=input.end(); ++it) 77 it->resize(max_feature); 78 } 79 80 input.back()[k-1]=v; 81 } 82 } 83 84 boost::shared_ptr<CPUMatrix> output_matrix=boost::make_shared<CPUMatrix>(output.size(), 1); 85 boost::shared_ptr<CPUMatrix> input_matrix=boost::make_shared<CPUMatrix>(input.size(), max_feature); 86 for(std::size_t row=0; row<input.size(); ++row){ 87 output_matrix->setValue(row, 0, output[row]); 88 for(std::size_t column=0; column<max_feature; ++column) 89 input_matrix->setValue(row, column, input[row][column]); 90 } 91 92 boost::shared_ptr<Tensor> data=boost::make_shared<Tensor>(2); 93 data->setMatrix(0, input_matrix); 94 data->setMatrix(1, output_matrix); 95 return Data_set(data, kinds); 96 } 97