commit 9aa890972b341519afb1339f636d968944f86ecf
parent 194510b0a3c0718ad8137de758e2646d5a4a93e7
Author: Étienne Simon <esimon@esimon.eu>
Date:   Wed, 16 Apr 2014 13:09:36 +0200
Add Model
Diffstat:
9 files changed, 340 insertions(+), 64 deletions(-)
diff --git a/dataset.py b/dataset.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python2
+
+import scipy.sparse
+import numpy
+import sys
+import theano
+
+class Dataset(object):
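+    """ Dataset class.
+
+    A dataset directory is expected to contain the files 'embeddings',
+    'relations', 'train', 'valid' and 'test'. Each line of train/valid/test
+    is a tab-separated (left, relation, right) triple of indices; every
+    example is encoded as a one-hot row of a sparse CSR matrix.
+    """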
+    def __init__(self, prefix):
+        print >>sys.stderr, '# Loading dataset "{0}"'.format(prefix)
+        with open(prefix+'/embeddings', 'r') as file:
+            self.embeddings = file.readlines()
+        with open(prefix+'/relations', 'r') as file:
+            self.relations = file.readlines()
+        self.number_embeddings = len(self.embeddings)
+        self.number_relations = len(self.relations)
+        self.load_file(prefix, 'train')
+        self.load_file(prefix, 'valid')
+        self.load_file(prefix, 'test')
+
+    def load_file(self, prefix, name):
+        with open('{0}/{1}'.format(prefix, name), 'r') as file:
+            content = map(lambda line: map(int, line.split('\t')), file.readlines())
+            [left, relation, right] = map(list, zip(*content))
+        N = len(relation)
+        setattr(self, name+'_size', N)
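+        # One-hot CSR encoding: row i holds a single 1 at the index of example i's entity or relation.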
+        setattr(self, name+'_right', scipy.sparse.csr_matrix(([1]*N, right, range(N+1)), shape=(N, self.number_embeddings), dtype=theano.config.floatX))
+        setattr(self, name+'_relation', scipy.sparse.csr_matrix(([1]*N, relation, range(N+1)), shape=(N, self.number_relations), dtype=theano.config.floatX))
+        setattr(self, name+'_left', scipy.sparse.csr_matrix(([1]*N, left, range(N+1)), shape=(N, self.number_embeddings), dtype=theano.config.floatX))
+
+    def training_minibatch(self, batch_size):
+        # Sampling corrupted entities
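+        # (one entity drawn uniformly at random per training example, encoded as a one-hot row)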
+        def sample_matrix():
+            row = range(self.train_size+1)
+            col = numpy.random.randint(0, self.number_embeddings, size=self.train_size)
+            data = numpy.ones(self.train_size)
+            random_embeddings = scipy.sparse.csr_matrix((data, col, row), shape=(self.train_size, self.number_embeddings), dtype=theano.config.floatX)
+            return random_embeddings
+        corrupted_left = sample_matrix()
+        corrupted_right = sample_matrix()
+
+        # Shuffling training set
+        order = numpy.random.permutation(self.train_size)
+        train_left = self.train_left[order, :]
+        train_right = self.train_right[order, :]
+        train_relation = self.train_relation[order, :]
+
+        # Yielding batches
+        ls = numpy.linspace(0, self.train_size, 1+self.train_size/batch_size).astype(int)
+        for i in xrange(len(ls)-1):
+            left_positive = train_left[ls[i]:ls[i+1]]
+            right_positive = train_right[ls[i]:ls[i+1]]
+            left_negative = corrupted_left[ls[i]:ls[i+1]]
+            right_negative = corrupted_right[ls[i]:ls[i+1]]
+            relation = train_relation[ls[i]:ls[i+1]]
+            yield (relation, left_positive, right_positive, left_negative, right_negative)
+
+    def iterate(self, name, batch_size):
+        def repeat_csr(matrix, size):
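+            # Tile the single one-hot row 'size' times so one example can be scored against a whole batch of candidate entities.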
+            data = list(matrix.data)*size
+            indices = list(matrix.indices)*size
+            indptr = range(size+1)
+            return scipy.sparse.csr_matrix((data, indices, indptr), shape=(size, matrix.shape[1]), dtype=theano.config.floatX)
+        N = getattr(self, name+'_size')
+        relation = getattr(self, name+'_relation')
+        left = getattr(self, name+'_left')
+        right = getattr(self, name+'_right')
+        for i in xrange(N):
+            yield (repeat_csr(relation[i], batch_size), repeat_csr(left[i], batch_size), right[i])
+
+    def universe_minibatch(self, batch_size):
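+        # Enumerate every entity as a one-hot row (rows of the identity matrix), in batches of batch_size.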
+        N = len(self.embeddings)
+        entities = scipy.sparse.eye(N, format='csr', dtype=theano.config.floatX)
+        for i in xrange(N/batch_size):
+            yield entities[i*batch_size:(i+1)*batch_size]
diff --git a/embeddings.py b/embeddings.py
@@ -26,10 +26,10 @@ class Embeddings(object):
 
         E_bound = numpy.sqrt(6. / dimension)
         E_values = rng.uniform(low=-E_bound, high=E_bound, size=(number, dimension))
-        E_values = E_values / numpy.sqrt(numpy.sum(E_values **2, axis=1))
+        E_values = E_values / numpy.sqrt(numpy.sum(E_values **2, axis=1))[:, numpy.newaxis]
         self.E = theano.shared(name=tag, value=numpy.asarray(E_values, dtype=theano.config.floatX))
 
-        self.params = [E]
+        self.parameters = [self.E]
 
     def embed(self, entities):
         """ Embed given entities.
@@ -39,15 +39,7 @@ class Embeddings(object):
         """
         return S.dot(entities, self.E)
 
-    def L1_norm(self):
-        """ Compute the L1-norm of the embeddings parameter. """
-        return T.sum(T.abs(self.E))
-
-    def sqrL2_norm(self):
-        """ Compute the squared L2-norm of the embeddings parameter. """
-        return T.sum(T.sqr(self.E))
-
-    def sgd_updates(self, cost, learning_rate):
+    def updates(self, cost, learning_rate):
         """ Compute the updates to perform a SGD step w.r.t. a given cost.
 
         Keyword arguments:
diff --git a/main.py b/main.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python2
+
+from dataset import *
+from model import *
+from relations.translations import *
+
+if __name__ == '__main__':
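+    # 'similarity' is the dissimilarity used to compare translated left and right embeddings (L1_norm or L2_norm from model.py).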
+    hyperparameters = dict()
+    hyperparameters['similarity'] = L1_norm
+    hyperparameters['rng'] = numpy.random
+    hyperparameters['dimension'] = 20
+    hyperparameters['margin'] = 1.
+    hyperparameters['relation_learning_rate'] = 1
+    hyperparameters['embeddings_learning_rate'] = 0.1
+    hyperparameters['train_batch_size'] = 100
+    hyperparameters['test_batch_size'] = 500
+    hyperparameters['validation_frequency'] = 500
+    hyperparameters['number_epoch'] = 10000
+
+    data = Dataset('data/dummy')
+    model = Model.initialise(Translations, data, hyperparameters, 'dummy')
+    model.train()
+    model.test()
diff --git a/model.py b/model.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python2
+
+import time
+
+import cPickle
+import sys
+import numpy
+import scipy
+import theano
+import theano.tensor as T
+import theano.sparse as S
+
+from embeddings import *
+
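+# Dissimilarity measures between translated left and right embeddings (lower means a better match).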
+def L1_norm(l, r):
+    return T.sum(abs(l-r), axis=1)
+
+def L2_norm(l, r):
+    return T.sqrt(T.sum(T.sqr(l-r), axis=1))
+
+class Model(object):
+    """ Model class.
+
+    The model is trained with SGD using a contrastive, margin-based ranking criterion.
+    """
+
+    @classmethod
+    def initialise(cls, Relations, dataset, hyperparameters, tag):
+        """ Initialise a model.
+
+        Keyword arguments:
+        Relations -- relations class
+        dataset -- dataset on which the model will be trained and tested
+        hyperparameters -- hyperparameters dictionary
+        tag -- name of the embeddings for parameter declaration
+        """
+
+        print >>sys.stderr, '# Initialising model "{0}"'.format(tag)
+
+        self = cls()
+        self.embeddings = Embeddings(hyperparameters['rng'], dataset.number_embeddings, hyperparameters['dimension'], tag+'.embeddings')
+        self.relations = Relations(hyperparameters['rng'], dataset.number_relations, hyperparameters['dimension'], tag+'.relations')
+        self.dataset = dataset
+        self.hyperparameters = hyperparameters
+        self.tag = tag
+
+        self.build()
+        return self
+
+    @classmethod
+    def load(cls, filepath, dataset, hyperparameters):
+        """ Load a model from a file.
+
+        Keyword arguments:
+        filepath -- path to the Model file
+        dataset -- dataset on which the model will be trained and tested
+        hyperparameters -- hyperparameters dictionary
+        """
+
+        print >>sys.stderr, '# Loading model from "{0}"'.format(filepath)
+
+        self = cls()
+
+        with open(filepath, 'rb') as file:
+            self.embeddings = cPickle.load(file)
+            self.relations = cPickle.load(file)
+        self.dataset = dataset
+        self.hyperparameters = hyperparameters
+
+        self.build()
+        return self
+
+    def save(self, filepath):
+        """ Save the model in a file. """
+        with open(filepath, 'wb') as file:
+            cPickle.dump(self.embeddings, file, -1)
+            cPickle.dump(self.relations, file, -1)
+
+    def build(self):
+        """ Build theano functions. """
+        print >>sys.stderr, '## Compiling Theano graph for model "{0}"'.format(self.tag)
+
+        self.parameters = self.relations.parameters + self.embeddings.parameters
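+        # Symbolic sparse inputs: relation, positive left, positive right, negative left, negative right (all one-hot CSR matrices).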
+        inputs = tuple(S.csr_matrix() for _ in xrange(5))
+        positive_left, positive_right = self.embeddings.embed(inputs[1]), self.embeddings.embed(inputs[2])
+        negative_left, negative_right = self.embeddings.embed(inputs[3]), self.embeddings.embed(inputs[4])
+        positive_score = self.hyperparameters['similarity'](self.relations.apply(positive_left, inputs[0]), positive_right)
+        negative_score = self.hyperparameters['similarity'](self.relations.apply(negative_left, inputs[0]), negative_right)
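+        # Margin ranking criterion: only examples where the positive triple does not beat the corrupted one by at least 'margin' contribute to the cost.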
+        score = self.hyperparameters['margin'] + positive_score - negative_score
+        violating_margin = score>0
+        criterion = T.mean(violating_margin*score)
+
+        self.train_function = theano.function(inputs=list(inputs), outputs=[criterion], updates=self.updates(criterion))
+        self.scoring_function = theano.function(inputs=list(inputs[0:3]), outputs=[positive_score])
+
+    def updates(self, cost):
+        """ Compute the updates to perform a SGD step w.r.t. a given cost.
+
+        Keyword arguments:
+        cost -- The cost to optimise.
+        """
+        lr_relations = self.hyperparameters['relation_learning_rate']
+        lr_embeddings = self.hyperparameters['embeddings_learning_rate']
+        return self.relations.updates(cost, lr_relations) + self.embeddings.updates(cost, lr_embeddings)
+
+    def train(self):
+        """ Train the model. """
+        print >>sys.stderr, '# Training the model "{0}"'.format(self.tag)
+
+        batch_size = self.hyperparameters['train_batch_size']
+        validation_frequency = self.hyperparameters['validation_frequency']
+        number_epoch = self.hyperparameters['number_epoch']
+
+        for epoch in xrange(number_epoch):
+            if epoch % validation_frequency == 0:
+                self.validate(epoch)
+
+            for (relation, left_positive, right_positive, left_negative, right_negative) in self.dataset.training_minibatch(batch_size):
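+                # Two SGD steps per batch: corrupt the right entity first, then the left entity.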
+                c1 = self.train_function(relation, left_positive, right_positive, left_positive, right_negative)
+                c2 = self.train_function(relation, left_positive, right_positive, left_negative, right_positive)
+
+    def error(self, name):
+        """ Compute the mean rank and top 10 on a given data. """
+        batch_size = self.hyperparameters['test_batch_size']
+        count, mean, top10 = 0, 0, 0
+        for (relation, left, right) in self.dataset.iterate(name, batch_size):
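+            # Score the (left, relation) query against every entity in the universe, then rank the true right entity.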
+            scores = None
+            for entities in self.dataset.universe_minibatch(batch_size):
+                batch_result = self.scoring_function(relation, left, entities)
+                scores = numpy.array(batch_result, dtype=theano.config.floatX) if scores is None else numpy.concatenate((scores, batch_result), axis=1)
+            rank = 1+numpy.where(numpy.argsort(scores)==right.indices[0])[1] # FIXME ugly
+            mean = mean + rank
+            count = count + 1
+            top10 = top10 + (rank<=10)
+        mean = float(mean) / count
+        top10 = float(top10) / count
+        return (mean, top10)
+
+    def validate(self, epoch):
+        """ Validate the model. """
+        print >>sys.stderr, 'Validation epoch {:<5}'.format(epoch),
+        (valid_mean, valid_top10) = self.error('valid')
+        (train_mean, train_top10) = self.error('train')
+        print >>sys.stderr, 'valid mean: {0:<15} valid top10: {1:<15} train mean: {2:<15} train top10: {3:<15}'.format(valid_mean, valid_top10, train_mean, train_top10)
+
+    def test(self):
+        """ Test the model. """
+        print >>sys.stderr, '# Testing the model "{0}"'.format(self.tag),
+        (mean, top10) = self.error('test')
+        print >>sys.stderr, ' mean: {0:<15} top10: {1:<15}'.format(mean, top10)
diff --git a/relations/__init__.py b/relations/__init__.py
diff --git a/relations/translation.py b/relations/translation.py
@@ -1,53 +0,0 @@
-#!/usr/bin/env python2
-
-import numpy
-import theano
-import theano.tensor as T
-import theano.sparse as S
-
-class Translations(object):
-    """ Translations class.
-
-    This class has one parameter:
-    R -- the translations
-    """
-    def __init__(self, rng, number, dimension, tag):
-        """ Initialise the parameter.
-
-        Keyword arguments:
-        rng -- module for random number generation
-        number -- number of relation
-        dimension -- dimension of the embeddings
-        tag -- name of the relations for parameter declaration
-        """
-
-        self.number = number
-        self.dimension = dimension
-
-        R_bound = numpy.sqrt(6. / dimension)
-        R_values = rng.uniform(low=-R_bound, high=R_bound, size=(number, dimension))
-        R_values = R_values / numpy.sqrt(numpy.sum(R_values **2, axis=1))
-        self.R = theano.shared(name=tag, value=numpy.asarray(R_values, dtype=theano.config.floatX))
-
-        self.params = [R]
-
-    def L1_norm(self):
-        """ Compute the L1-norm of the relations parameter. """
-        return T.sum(T.abs(self.R))
-
-    def sqrL2_norm(self):
-        """ Compute the squared L2-norm of the relations parameter. """
-        return T.sum(T.sqr(self.R))
-
-    def apply(self, input, relations):
-        """ Apply the given relations to a given input. """
-        return S.dot(relations, self.R)+inputs
-
-    def sgd_updates(self, cost, learning_rate):
-        """ Compute the updates to perform a SGD step w.r.t. a given cost.
-
-        Keyword arguments:
-        cost -- The cost to optimise.
-        learning_rate -- The learning rate used for gradient descent.
-        """
-        return [(self.R, self.R - learning_rate * T.grad(cost=cost, wrt=self.R))]
diff --git a/relations/translations.py b/relations/translations.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python2
+
+import numpy
+import theano
+import theano.tensor as T
+import theano.sparse as S
+
+class Translations(object):
+    """ Translations class.
+
+    This class has one parameter:
+    R -- the translations
+    """
+    def __init__(self, rng, number, dimension, tag):
+        """ Initialise the parameter.
+
+        Keyword arguments:
+        rng -- module for random number generation
+        number -- number of relations
+        dimension -- dimension of the embeddings
+        tag -- name of the relations for parameter declaration
+        """
+
+        self.number = number
+        self.dimension = dimension
+
+        R_bound = numpy.sqrt(6. / dimension)
+        R_values = rng.uniform(low=-R_bound, high=R_bound, size=(number, dimension))
+        R_values = R_values / numpy.sqrt(numpy.sum(R_values **2, axis=1))[:, numpy.newaxis]
+        self.R = theano.shared(name=tag, value=numpy.asarray(R_values, dtype=theano.config.floatX))
+
+        self.parameters = [self.R]
+
+    def apply(self, inputs, relations):
+        """ Apply the given relations to a given input. """
+        return S.dot(relations, self.R)+inputs
+
+    def updates(self, cost, learning_rate):
+        """ Compute the updates to perform a SGD step w.r.t. a given cost.
+
+        Keyword arguments:
+        cost -- The cost to optimise.
+        learning_rate -- The learning rate used for gradient descent.
+        """
+        return [(self.R, self.R - learning_rate * T.grad(cost=cost, wrt=self.R))]
diff --git a/utils/__init__.py b/utils/__init__.py
diff --git a/utils/construct_dummy_dataset.py b/utils/construct_dummy_dataset.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python2
+
+import sys
+import os
+import shutil
+import random
+
+def construct_dummy_dataset(kind, prefix, n_embeddings, n_relations):
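+    """ Build a small synthetic dataset under 'prefix'.
+
+    kind -- 'id' relates every entity to itself; 'halfperm' relates the first
+    half of the entities to a random permutation of the second half.
+    """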
+    os.mkdir(prefix)
+
+    with open(prefix+'/embeddings', 'w') as file:
+        for i in xrange(n_embeddings):
+            file.write('E{0}\n'.format(i))
+
+    with open(prefix+'/relations', 'w') as file:
+        for i in xrange(n_relations):
+            file.write('R{0}\n'.format(i))
+
+    with open(prefix+'/train', 'w') as file:
+        for r in xrange(n_relations):
+            right = range(n_embeddings/2)
+            random.shuffle(right)
+            if kind=='id':
+                for e in xrange(n_embeddings):
+                    file.write('{0}\t{1}\t{2}\n'.format(e, r, e))
+            elif kind=='halfperm':
+                for e in xrange(n_embeddings/2):
+                    file.write('{0}\t{1}\t{2}\n'.format(e, r, right[e]+n_embeddings/2))
+            else:
+                raise ValueError('Unknown kind')
+
+    shutil.copyfile(prefix+'/train', prefix+'/valid')
+    shutil.copyfile(prefix+'/train', prefix+'/test')
+
+if __name__ == '__main__':
+    if len(sys.argv)<5:
+        print >>sys.stderr, 'Usage: {0} {{id, halfperm}} dataset_name n_embeddings n_relations'.format(sys.argv[0])
+        sys.exit(1)
+    kind = sys.argv[1]
+    prefix = sys.argv[2]
+
+    n_embeddings = int(sys.argv[3])
+    n_relations = int(sys.argv[4])
+
+    construct_dummy_dataset(kind, prefix, n_embeddings, n_relations)