commit c61b71b63396648f490d9cb10e31de2bcdba601f
parent 682d2a64915e11eeaac999306c17f9d12f9fb22a
Author: Étienne Simon <esimon@esimon.eu>
Date:   Wed, 30 Apr 2014 15:44:54 +0200
Add meta-model and train/test executables
Diffstat:
| A | config.py |  |  | 25 | +++++++++++++++++++++++++ | 
| M | dataset.py |  |  | 7 | ++++--- | 
| D | main.py |  |  | 32 | -------------------------------- | 
| A | meta_model.py |  |  | 52 | ++++++++++++++++++++++++++++++++++++++++++++++++++++ | 
| M | model.py |  |  | 134 | +++++++++++++++++++++++++++++++++++++------------------------------------------ | 
| A | test.py |  |  | 33 | +++++++++++++++++++++++++++++++++ | 
| A | train.py |  |  | 26 | ++++++++++++++++++++++++++ | 
7 files changed, 203 insertions(+), 106 deletions(-)
diff --git a/config.py b/config.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python2
+
+import numpy
+import json
+from model import *
+from relations import *
+
+def load_config(path):
+    """ Load a JSON config file and evaluate its python-valued entries.
+
+    Every top-level string value starting with 'python:' is replaced by the
+    result of evaluating the remainder of the string as a Python expression.
+
+    Keyword arguments:
+    path -- path to the JSON config file
+
+    Returns the config dictionary.
+    """
+    with open(path, 'r') as config_file:
+        config = json.load(config_file)
+    # Iterate over a snapshot of the entries: the dictionary is updated inside the loop.
+    # The local names config, k and v are kept as is since evaluated expressions can see them.
+    for k, v in list(config.items()):
+        if isinstance(v, basestring) and v.startswith('python:'):
+            # SECURITY: eval executes arbitrary code, only load config files from a trusted source.
+            # The expression is evaluated in this scope and can use the names imported by this module.
+            config[k] = eval(v[len('python:'):])
+    return config
+
+def expand_config(base_config):
+    """ Expand a config into one config per model.
+
+    Keyword arguments:
+    base_config -- config dictionary, its 'size' entry gives the number of configs to produce
+
+    Returns a list of 'size' shallow copies of base_config. In the copy of index i,
+    every value which is a list is replaced by its i-th element, and a 'model name'
+    which is not a list is suffixed with the zero-padded index i.
+    """
+    count = base_config['size']
+    padding = len(str(count))
+    expanded = []
+    for position in xrange(count):
+        current = base_config.copy()
+        name = current['model name']
+        if not isinstance(name, list):
+            current['model name'] = '{0} {1:0{width}}'.format(name, position, width=padding)
+        # Work on a snapshot of the entries since they are overwritten in place.
+        for key, value in current.items():
+            if isinstance(value, list):
+                current[key] = value[position]
+        expanded.append(current)
+    return expanded
diff --git a/dataset.py b/dataset.py
@@ -6,7 +6,8 @@ import numpy
 import theano
 
 class Dataset(object):
-    def __init__(self, prefix):
+    def __init__(self, prefix, rng):
+        self.rng = rng
         log('# Loading dataset "{0}"\n'.format(prefix))
         with open(prefix+'/embeddings', 'r') as file:
             self.embeddings = file.readlines()
@@ -33,7 +34,7 @@ class Dataset(object):
         # Sampling corrupted entities
         def sample_matrix():
+            # Build a sparse matrix with one row per training example, each row holding a single 1
+            # in a column index drawn by self.rng in [0, number_embeddings).
+            # csr_matrix((data, col, row)) reads row as the CSR index pointer: row i owns the single entry i.
             row = range(self.train_size+1)
-            col = numpy.random.randint(0, self.number_embeddings, size=self.train_size)
+            col = self.rng.randint(0, self.number_embeddings, size=self.train_size)
             data = numpy.ones(self.train_size)
             random_embeddings = scipy.sparse.csr_matrix((data, col, row), shape=(self.train_size, self.number_embeddings), dtype=theano.config.floatX)
             return random_embeddings
@@ -41,7 +42,7 @@ class Dataset(object):
         corrupted_right = sample_matrix()
 
         # Shuffling training set
-        order = numpy.random.permutation(self.train_size)
+        order = self.rng.permutation(self.train_size)
         train_left = self.train_left[order, :]
         train_right = self.train_right[order, :]
         train_relation = self.train_relation[order, :]
diff --git a/main.py b/main.py
@@ -1,32 +0,0 @@
-#!/usr/bin/env python2
-
-from __future__ import print_function
-from utils.log import *
-import sys
-import json
-
-from dataset import *
-from model import *
-from relations import *
-
-if __name__ == '__main__':
-    if len(sys.argv)<3:
-        print('Usage: {0} data config [model]'.format(sys.argv[0]), file=sys.stderr)
-        sys.exit(1)
-    data = sys.argv[1]
-    config_path = sys.argv[2]
-    model_path = None if len(sys.argv)<4 else sys.argv[3]
-
-    with open(config_path, 'r') as config_file:
-        config = json.load(config_file)
-        for k, v in config.iteritems():
-            if isinstance(v, basestring) and v.startswith('python:'):
-                config[k] = eval(v[7:])
-
-    data = Dataset(data)
-    if model_path is None:
-        model = Model.initialise(config['relations'], data, config, config['model name'])
-    else:
-        model = Model.load(model_path, data, config, config['model name'])
-    model.train()
-    model.test()
diff --git a/meta_model.py b/meta_model.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python2
+
+from utils.log import *
+from config import *
+from model import *
+import numpy
+
+class Meta_model(object):
+    """ Meta-model class.
+
+    Rank entities by combining the scores of several models with the config['scores combinator'] function.
+    """
+
+    def __init__(self, dataset, config, pathes=None):
+        """ Load the models of a meta-model.
+
+        Keyword arguments:
+        dataset -- dataset on which the meta-model will be tested
+        config -- config dictionary, expanded into one config per model
+        pathes -- path or list of paths to the Model files, default to the best saved model of each config
+
+        Raises ValueError if the number of paths differs from the number of configs.
+        """
+        self.dataset = dataset
+        self.combine_scores = config['scores combinator']
+        configs = expand_config(config)
+        if pathes is None:
+            # The loop variable is not named config: a Python 2 list comprehension would rebind the argument.
+            pathes = [ '{0}/{1}.best'.format(model_config['best model save path'], model_config['model name']) for model_config in configs ]
+        elif isinstance(pathes, basestring):
+            # A lone path must be wrapped, zip would otherwise pair the configs with its characters.
+            pathes = [ pathes ]
+        else:
+            pathes = list(pathes)
+        if len(pathes) != len(configs):
+            # zip would otherwise silently ignore the configs or paths in excess.
+            raise ValueError('{0} model paths given for a meta-model of size {1}'.format(len(pathes), len(configs)))
+        self.models = [ Model(dataset, model_config, path) for model_config, path in zip(configs, pathes) ]
+
+    def build_test(self):
+        """ Build theano test functions of every model. """
+        for model in self.models:
+            model.build_test()
+
+    def left_scoring_function(self, relation, left, right):
+        """ Score with every model, returns a matrix with one row per row of right and one column per model. """
+        res = [ model.left_scoring_function(relation, left, right) for model in self.models ]
+        return numpy.transpose(res).reshape(right.shape[0], len(self.models))
+
+    def right_scoring_function(self, relation, left, right):
+        """ Score with every model, returns a matrix with one row per row of left and one column per model. """
+        res = [ model.right_scoring_function(relation, left, right) for model in self.models ]
+        return numpy.transpose(res).reshape(left.shape[0], len(self.models))
+
+    def error(self):
+        """ Compute the mean rank, standard deviation and top 10 on the test data.
+
+        Returns a tuple (mean rank, standard deviation of the ranks, proportion of ranks lower or equal to 10).
+        """
+        result = []
+        entities = self.dataset.universe
+        for (relation, left, right) in self.dataset.iterate('test'):
+            raw_left_scores = self.left_scoring_function(relation, left, entities)
+            raw_right_scores = self.right_scoring_function(relation, entities, right)
+            # The combinator merges the per-model columns, presumably into a single score per entity.
+            left_scores = self.combine_scores(raw_left_scores)
+            right_scores = self.combine_scores(raw_right_scores)
+            # 1-based position of the expected entity among the scores sorted in increasing order.
+            left_rank = 1+numpy.asscalar(numpy.where(numpy.argsort(left_scores)==right.indices[0])[0]) # FIXME Ugly
+            right_rank = 1+numpy.asscalar(numpy.where(numpy.argsort(right_scores)==left.indices[0])[0]) # FIXME Ugly
+            result.extend((left_rank, right_rank))
+        mean = numpy.mean(result)
+        std = numpy.std(result)
+        top10 = numpy.mean([ rank<=10 for rank in result ])
+        return (mean, std, top10)
+
+    def test(self):
+        """ Test the model. """
+        log('# Testing the model')
+        (mean, std, top10) = self.error()
+        log(' mean: {0:<15} std: {1:<15} top10: {2:<15}\n'.format(mean, std, top10))
diff --git a/model.py b/model.py
@@ -27,50 +27,29 @@ class Model(object):
     Training model using SGD with a contrastive criterion.
     """
 
-    @classmethod
-    def initialise(cls, Relations, dataset, config, tag):
+    def __init__(self, dataset, config, filepath=None):
         """ Initialise a model.
 
         Keyword arguments:
-        Relations -- relations class
         dataset -- dataset on which the model will be trained and tested
         config -- config dictionary
-        tag -- name of the embeddings for parameter declaration
+        filepath -- path to the Model file to load, new parameters are created when None
         """
-        log('# Initialising model "{0}"\n'.format(tag))
 
-        self = cls()
-        self.embeddings = Embeddings(config['rng'], dataset.number_embeddings, config['dimension'], tag+'.embeddings')
-        self.relations = Relations(config['rng'], dataset.number_relations, config['dimension'], tag+'.relations')
+        log('# Initialising model "{0}"\n'.format(config['model name']))
         self.dataset = dataset
         self.config = config
-        self.tag = tag
-
-        self.build()
-        return self
-
-    @classmethod
-    def load(cls, filepath, dataset, config, tag):
-        """ Load a model from a file.
-
-        Keyword arguments:
-        filepath -- path to the Model file
-        dataset -- dataset on which the model will be trained and tested
-        config -- config dictionary
-        tag -- name of the embeddings for parameter declaration
-        """
-        log('# Loading model from "{0}"\n'.format(filepath))
-
-        self = cls()
-        with open(filepath, 'rb') as file:
-            self.embeddings = cPickle.load(file)
-            self.relations = cPickle.load(file)
-        self.dataset = dataset;
-        self.config = config;
-        self.tag = tag
-
-        self.build()
-        return self
+        self.tag = config['model name']
+
+        # The theano functions are no longer compiled here: build_train and build_test have to be called explicitly.
+        if filepath is None:
+            # The relations class is now read from the config instead of being given as an argument.
+            Relations = config['relations']
+            self.embeddings = Embeddings(config['rng'], dataset.number_embeddings, config['dimension'], self.tag+'.embeddings')
+            self.relations = Relations(config['rng'], dataset.number_relations, config['dimension'], self.tag+'.relations')
+        else:
+            log('## Loading model from "{0}"\n'.format(filepath))
+            # Unpickle in the order used by save: embeddings first, then relations.
+            # NOTE unpickling can execute arbitrary code, only load model files from a trusted source.
+            with open(filepath, 'rb') as file:
+                self.embeddings = cPickle.load(file)
+                self.relations = cPickle.load(file)
 
     def save(self, filepath):
         """ Save the model in a file. """
@@ -78,15 +57,18 @@ class Model(object):
             cPickle.dump(self.embeddings, file, -1)
             cPickle.dump(self.relations, file, -1)
 
-    def build(self):
-        """ Build theano functions. """
-        log('## Compiling Theano graph for model "{0}"\n'.format(self.tag))
-
-        self.parameters = self.relations.parameters + self.embeddings.parameters
-        inputs = tuple(S.csr_matrix() for _ in xrange(5))
-        left_positive, right_positive = self.embeddings.embed(inputs[1]), self.embeddings.embed(inputs[2])
-        left_negative, right_negative = self.embeddings.embed(inputs[3]), self.embeddings.embed(inputs[4])
-        relation = self.relations.lookup(inputs[0])
+    def build_train(self):
+        """ Build theano train functions. """
+        log('## Compiling Theano graph for training model "{0}"\n'.format(self.tag))
+        input_relation = S.csr_matrix("relation")
+        input_left_positive = S.csr_matrix("left positive")
+        input_right_positive = S.csr_matrix("right positive")
+        input_left_negative = S.csr_matrix("left negative")
+        input_right_negative = S.csr_matrix("right negative")
+        inputs = [ input_relation, input_left_positive, input_right_positive, input_left_negative, input_right_negative ]
+        left_positive, right_positive = self.embeddings.embed(input_left_positive), self.embeddings.embed(input_right_positive)
+        left_negative, right_negative = self.embeddings.embed(input_left_negative), self.embeddings.embed(input_right_negative)
+        relation = self.relations.lookup(input_relation)
 
         score_positive = self.config['similarity'](self.relations.transform(left_positive, relation), right_positive)
         score_left_negative = self.config['similarity'](self.relations.transform(left_negative, relation), right_positive)
@@ -100,17 +82,27 @@ class Model(object):
         criterion_right = T.mean(violating_margin_right*score_right)
         criterion = criterion_left + criterion_right
 
-        self.train_function = theano.function(inputs=list(inputs), outputs=[criterion], updates=self.updates(criterion))
+        self.train_function = theano.function(inputs=inputs, outputs=[criterion], updates=self.updates(criterion))
         self.normalise_function = theano.function(inputs=[], outputs=[], updates=self.embeddings.normalise_updates())
 
+    def build_test(self):
+        """ Build theano test functions.
+
+        Define self.left_scoring_function and self.right_scoring_function, both taking
+        the sparse matrices (relation, left, right) and returning a list holding a single score output.
+        """
+        log('## Compiling Theano graph for testing model "{0}"\n'.format(self.tag))
+        input_relation = S.csr_matrix("relation")
+        input_left = S.csr_matrix("left")
+        input_right = S.csr_matrix("right")
+        inputs = [ input_relation, input_left, input_right ]
+        left, right = self.embeddings.embed(input_left), self.embeddings.embed(input_right)
+        relation = self.relations.lookup(input_relation)
+
+        # The first axis of the relation and of one of the two sides is marked as broadcastable,
+        # presumably so that a single row can be scored against every row of the other side (see error) -- TODO confirm.
         relation = map(lambda r: T.addbroadcast(r, 0), relation)
-        left_broadcasted = T.addbroadcast(left_positive, 0)
-        right_broadcasted = T.addbroadcast(right_positive, 0)
-        left_score = self.config['similarity'](self.relations.transform(left_broadcasted, relation), right_positive)
-        right_score = self.config['similarity'](self.relations.transform(left_positive, relation), right_broadcasted)
+        left_broadcasted = T.addbroadcast(left, 0)
+        right_broadcasted = T.addbroadcast(right, 0)
+        left_score = self.config['similarity'](self.relations.transform(left_broadcasted, relation), right)
+        right_score = self.config['similarity'](self.relations.transform(left, relation), right_broadcasted)
 
-        self.left_scoring_function = theano.function(inputs=list(inputs[0:3]), outputs=[left_score])
-        self.right_scoring_function = theano.function(inputs=list(inputs[0:3]), outputs=[right_score])
+        self.left_scoring_function = theano.function(inputs=inputs, outputs=[left_score])
+        self.right_scoring_function = theano.function(inputs=inputs, outputs=[right_score])
 
     def updates(self, cost):
         """ Compute the updates to perform a SGD step w.r.t. a given cost.
@@ -131,36 +123,36 @@ class Model(object):
         number_epoch = self.config['number of epoch']
 
         for epoch in xrange(number_epoch):
-            if (epoch+1) % validation_frequency == 0:
-                self.validate(epoch+1)
-
             for (relation, left_positive, right_positive, left_negative, right_negative) in self.dataset.training_minibatch(batch_size):
                 self.normalise_function()
                 self.train_function(relation, left_positive, right_positive, left_negative, right_negative)
 
-    def error(self, name):
-        """ Compute the mean rank and top 10 on a given data. """
-        count, mean, top10 = 0, 0, 0
+            if (epoch+1) % validation_frequency == 0:
+                self.validate(epoch+1)
+
+    def error(self, name, transform_scores=(lambda x: x)):
+        """ Compute the mean rank, standard deviation and top 10 on a given data.
+
+        Keyword arguments:
+        name -- name of the data given to dataset.iterate (e.g. 'valid', 'train' or 'test')
+        transform_scores -- function applied to the left and right scores before ranking, identity by default
+
+        Returns a tuple (mean rank, standard deviation of the ranks, proportion of ranks lower or equal to 10).
+        """
+        result = []
         for (relation, left, right) in self.dataset.iterate(name):
-            left_scores, right_scores = None, None
             entities = self.dataset.universe
             left_scores = self.left_scoring_function(relation, left, entities)
             right_scores = self.right_scoring_function(relation, entities, right)
+            left_scores = transform_scores(left_scores)
+            right_scores = transform_scores(right_scores)
+            # 1-based position of the expected entity among the scores sorted in increasing order.
             left_rank = 1+numpy.asscalar(numpy.where(numpy.argsort(left_scores)==right.indices[0])[1]) # FIXME Ugly
             right_rank = 1+numpy.asscalar(numpy.where(numpy.argsort(right_scores)==left.indices[0])[1]) # FIXME Ugly
-            count += 2
-            mean += left_rank + right_rank
-            top10 += (left_rank<=10) + (right_rank<=10)
-        mean = float(mean) / count
-        top10 = float(top10) / count
-        return (mean, top10)
+            result.extend((left_rank, right_rank))
+        mean = numpy.mean(result)
+        std = numpy.std(result)
+        top10 = numpy.mean(map(lambda x: x<=10, result))
+        return (mean, std, top10)
 
     def validate(self, epoch):
         """ Validate the model. """
         log('Validation epoch {:<5}'.format(epoch))
-        (valid_mean, valid_top10) = self.error('valid')
-        log(' valid mean: {0:<15} valid top10: {1:<15}'.format(valid_mean, valid_top10))
-        datalog(self.config['datalog path']+'/'+self.config['model name'], epoch, valid_mean, valid_top10)
+        (valid_mean, valid_std, valid_top10) = self.error('valid')
+        log(' valid mean: {0:<15} valid std: {1:<15} valid top10: {2:<15}'.format(valid_mean, valid_std, valid_top10))
+        datalog(self.config['datalog path']+'/'+self.config['model name'], epoch, valid_mean, valid_std, valid_top10)
         if not hasattr(self, 'best_mean') or valid_mean < self.best_mean:
             self.best_mean = valid_mean
             log('(best so far')
@@ -171,14 +163,14 @@ class Model(object):
             log(')')
 
         if self.config['validate on training data']:
-            (train_mean, train_top10) = self.error('train')
-            log(' train mean: {0:<15} train top10: {1:<15}'.format(train_mean, train_top10))
+            (train_mean, train_std, train_top10) = self.error('train')
+            log(' train mean: {0:<15} std: {1:<15} train top10: {2:<15}'.format(train_mean, train_std, train_top10))
         log('\n')
 
     def test(self):
         """ Test the model. """
         log('# Testing the model "{0}"'.format(self.tag))
-        (mean, top10) = self.error('test')
-        log(' mean: {0:<15} top10: {1:<15} (saving...'.format(mean, top10))
+        # Log the ranking statistics computed on the test data, then save the model as "<last model save path>/<model name>.last".
+        (mean, std, top10) = self.error('test')
+        log(' mean: {0:<15} std: {1:<15} top10: {2:<15} (saving...'.format(mean, std, top10))
         self.save('{0}/{1}.last'.format(self.config['last model save path'], self.config['model name']))
         log(' done)\n')
diff --git a/test.py b/test.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python2
+
+from __future__ import print_function
+import sys
+
+from dataset import *
+from model import *
+from meta_model import *
+from config import *
+
+if __name__ == '__main__':
+    # Test a single model or, when the config has a true 'meta' entry, a meta-model.
+    if len(sys.argv)<3:
+        print('Usage: {0} data config [models]...'.format(sys.argv[0]), file=sys.stderr)
+        sys.exit(1)
+    data = sys.argv[1]
+    config_path = sys.argv[2]
+    # Always a list at this point, possibly empty.
+    model_pathes = sys.argv[3:]
+
+    config = load_config(config_path)
+    is_meta = config.get('meta', False)
+    if is_meta:
+        # A meta-model takes a list of paths, even for a single path, or None to use the best saved models.
+        ModelType = Meta_model
+        model_pathes = model_pathes or None
+    else:
+        if len(model_pathes)>1:
+            print('Error: multiple model specified while running in single mode', file=sys.stderr)
+            sys.exit(1)
+        # A single model takes a single path, defaulting to its best saved model.
+        ModelType = Model
+        if model_pathes:
+            model_pathes = model_pathes[0]
+        else:
+            model_pathes = '{0}/{1}.best'.format(config['best model save path'], config['model name'])
+
+    data = Dataset(data, config['rng'])
+    model = ModelType(data, config, model_pathes)
+    model.build_test()
+    model.test()
diff --git a/train.py b/train.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python2
+
+from __future__ import print_function
+import sys
+
+from dataset import *
+from model import *
+from relations import *
+from config import *
+
+if __name__ == '__main__':
+    # Expected command line: data config [model]
+    arguments = sys.argv
+    if len(arguments)<3:
+        print('Usage: {0} data config [model]'.format(arguments[0]), file=sys.stderr)
+        sys.exit(1)
+    data_path, config_path = arguments[1], arguments[2]
+    # Without a model file, a new model is initialised.
+    if len(arguments)>=4:
+        model_path = arguments[3]
+    else:
+        model_path = None
+
+    config = load_config(config_path)
+    data = Dataset(data_path, config['rng'])
+    model = Model(data, config, model_path)
+
+    # Compile both the training and the testing functions before running them.
+    model.build_train()
+    model.build_test()
+    model.train()
+    model.test()