data-science-ipython-notebooks/deep-learning/theano-tutorial/rnn_tutorial/lstm_text.py

import cPickle as pkl
import time

import numpy
import theano
from theano import config
import theano.tensor as T
from theano.tensor.nnet import categorical_crossentropy

from fuel.datasets import TextFile
from fuel.streams import DataStream
from fuel.schemes import ConstantScheme
from fuel.transformers import Batch, Padding


# These files can be downloaded from
# http://www-etud.iro.umontreal.ca/~brakelp/train.txt.gz
# http://www-etud.iro.umontreal.ca/~brakelp/dictionary.pkl
# don't forget to change the paths and gunzip train.txt.gz
TRAIN_FILE = '/u/brakelp/temp/traindata.txt'
VAL_FILE = '/u/brakelp/temp/valdata.txt'
DICT_FILE = '/u/brakelp/temp/dictionary.pkl'


def sequence_categorical_crossentropy(prediction, targets, mask):
    prediction_flat = prediction.reshape(((prediction.shape[0] *
                                           prediction.shape[1]),
                                          prediction.shape[2]), ndim=2)
    targets_flat = targets.flatten()
    mask_flat = mask.flatten()
    ce = categorical_crossentropy(prediction_flat, targets_flat)
    return T.sum(ce * mask_flat)


def gauss_weight(ndim_in, ndim_out=None, sd=.005):
    if ndim_out is None:
        ndim_out = ndim_in
    W = numpy.random.randn(ndim_in, ndim_out) * sd
    return numpy.asarray(W, dtype=config.floatX)


class LogisticRegression(object):
    """Multi-class Logistic Regression Class

    The logistic regression is fully described by a weight matrix :math:`W`
    and bias vector :math:`b`. Classification is done by projecting data
    points onto a set of hyperplanes, the distance to which is used to
    determine a class membership probability.
    """

    def __init__(self, input, n_in, n_out):
        """ Initialize the parameters of the logistic regression

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie

        """

        # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
        self.W = theano.shared(value=numpy.zeros((n_in, n_out),
                                                 dtype=theano.config.floatX),
                               name='W', borrow=True)
        # initialize the baises b as a vector of n_out 0s
        self.b = theano.shared(value=numpy.zeros((n_out,),
                                                 dtype=theano.config.floatX),
                               name='b', borrow=True)

        # compute vector of class-membership probabilities in symbolic form
        energy = T.dot(input, self.W) + self.b
        energy_exp = T.exp(energy - T.max(energy, 2)[:, :, None])
        pmf = energy_exp / energy_exp.sum(2)[:, :, None]
        self.p_y_given_x = pmf

        # compute prediction as class whose probability is maximal in
        # symbolic form
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)

        # parameters of the model
        self.params = [self.W, self.b]


def index_dot(indices, w):
    return w[indices.flatten()]


class LstmLayer:

    def __init__(self, rng, input, mask, n_in, n_h):

        # Init params
        self.W_i = theano.shared(gauss_weight(n_in, n_h), 'W_i', borrow=True)
        self.W_f = theano.shared(gauss_weight(n_in, n_h), 'W_f', borrow=True)
        self.W_c = theano.shared(gauss_weight(n_in, n_h), 'W_c', borrow=True)
        self.W_o = theano.shared(gauss_weight(n_in, n_h), 'W_o', borrow=True)

        self.U_i = theano.shared(gauss_weight(n_h), 'U_i', borrow=True)
        self.U_f = theano.shared(gauss_weight(n_h), 'U_f', borrow=True)
        self.U_c = theano.shared(gauss_weight(n_h), 'U_c', borrow=True)
        self.U_o = theano.shared(gauss_weight(n_h), 'U_o', borrow=True)

        self.b_i = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),
                                 'b_i', borrow=True)
        self.b_f = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),
                                 'b_f', borrow=True)
        self.b_c = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),
                                 'b_c', borrow=True)
        self.b_o = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),
                                 'b_o', borrow=True)

        self.params = [self.W_i, self.W_f, self.W_c, self.W_o,
                       self.U_i, self.U_f, self.U_c, self.U_o,
                       self.b_i, self.b_f, self.b_c, self.b_o]

        outputs_info = [T.zeros((input.shape[1], n_h)),
                        T.zeros((input.shape[1], n_h))]

        rval, updates = theano.scan(self._step,
                                    sequences=[mask, input],
                                    outputs_info=outputs_info)

        # self.output is in the format (batchsize, n_h)
        self.output = rval[0]

    def _step(self, m_, x_, h_, c_):

        i_preact = (index_dot(x_, self.W_i) +
                    T.dot(h_, self.U_i) + self.b_i)
        i = T.nnet.sigmoid(i_preact)

        f_preact = (index_dot(x_, self.W_f) +
                    T.dot(h_, self.U_f) + self.b_f)
        f = T.nnet.sigmoid(f_preact)

        o_preact = (index_dot(x_, self.W_o) +
                    T.dot(h_, self.U_o) + self.b_o)
        o = T.nnet.sigmoid(o_preact)

        c_preact = (index_dot(x_, self.W_c) +
                    T.dot(h_, self.U_c) + self.b_c)
        c = T.tanh(c_preact)

        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * T.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c


def train_model(batch_size=100, n_h=50, n_epochs=40):

    # Load the datasets with Fuel
    dictionary = pkl.load(open(DICT_FILE, 'r'))
    dictionary['~'] = len(dictionary)
    reverse_mapping = dict((j, i) for i, j in dictionary.items())

    print("Loading the data")
    train = TextFile(files=[TRAIN_FILE],
                     dictionary=dictionary,
                     unk_token='~',
                     level='character',
                     preprocess=str.lower,
                     bos_token=None,
                     eos_token=None)

    train_stream = DataStream.default_stream(train)

    # organize data in batches and pad shorter sequences with zeros
    train_stream = Batch(train_stream,
                         iteration_scheme=ConstantScheme(batch_size))
    train_stream = Padding(train_stream)

    # idem dito for the validation text
    val = TextFile(files=[VAL_FILE],
                     dictionary=dictionary,
                     unk_token='~',
                     level='character',
                     preprocess=str.lower,
                     bos_token=None,
                     eos_token=None)

    val_stream = DataStream.default_stream(val)

    # organize data in batches and pad shorter sequences with zeros
    val_stream = Batch(val_stream,
                         iteration_scheme=ConstantScheme(batch_size))
    val_stream = Padding(val_stream)

    print('Building model')

    # Set the random number generator' seeds for consistency
    rng = numpy.random.RandomState(12345)

    x = T.lmatrix('x')
    mask = T.matrix('mask')

    # Construct the LSTM layer
    recurrent_layer = LstmLayer(rng=rng, input=x, mask=mask, n_in=111, n_h=n_h)

    logreg_layer = LogisticRegression(input=recurrent_layer.output[:-1],
                                      n_in=n_h, n_out=111)

    cost = sequence_categorical_crossentropy(logreg_layer.p_y_given_x,
                                             x[1:],
                                             mask[1:]) / batch_size

    # create a list of all model parameters to be fit by gradient descent
    params = logreg_layer.params + recurrent_layer.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # update_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    learning_rate = 0.1
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    update_model = theano.function([x, mask], cost, updates=updates)

    evaluate_model = theano.function([x, mask], cost)

    # Define and compile a function for generating a sequence step by step.
    x_t = T.iscalar()
    h_p = T.vector()
    c_p = T.vector()
    h_t, c_t = recurrent_layer._step(T.ones(1), x_t, h_p, c_p)
    energy = T.dot(h_t, logreg_layer.W) + logreg_layer.b

    energy_exp = T.exp(energy - T.max(energy, 1)[:, None])

    output = energy_exp / energy_exp.sum(1)[:, None]
    single_step = theano.function([x_t, h_p, c_p], [output, h_t, c_t])

    start_time = time.clock()

    iteration = 0

    for epoch in range(n_epochs):
        print 'epoch:', epoch

        for x_, mask_ in train_stream.get_epoch_iterator():
            iteration += 1

            cross_entropy = update_model(x_.T, mask_.T)


            # Generate some text after each 20 minibatches
            if iteration % 40 == 0:
                try:
                    prediction = numpy.ones(111, dtype=config.floatX) / 111.0
                    h_p = numpy.zeros((n_h,), dtype=config.floatX)
                    c_p = numpy.zeros((n_h,), dtype=config.floatX)
                    initial = 'the meaning of life is '
                    sentence = initial
                    for char in initial:
                        x_t = dictionary[char]
                        prediction, h_p, c_p = single_step(x_t, h_p.flatten(),
                                                           c_p.flatten())
                    sample = numpy.random.multinomial(1, prediction.flatten())
                    for i in range(450):
                        x_t = numpy.argmax(sample)
                        prediction, h_p, c_p = single_step(x_t, h_p.flatten(),
                                                           c_p.flatten())
                        sentence += reverse_mapping[x_t]
                        sample = numpy.random.multinomial(1, prediction.flatten())
                    print 'LSTM: "' + sentence + '"'
                except ValueError:
                    print 'Something went wrong during sentence generation.'

            if iteration % 40 == 0:
                print 'epoch:', epoch, '  minibatch:', iteration
                val_scores = []
                for x_val, mask_val in val_stream.get_epoch_iterator():
                    val_scores.append(evaluate_model(x_val.T, mask_val.T))
                print 'Average validation CE per sentence:', numpy.mean(val_scores)

    end_time = time.clock()
    print('Optimization complete.')
    print('The code ran for %.2fm' % ((end_time - start_time) / 60.))


if __name__ == '__main__':
    train_model()
Add Theano recurrent neural networks notebook. 2015-12-27 22:31:45 +08:00			`import cPickle as pkl`
			`import time`

			`import numpy`
			`import theano`
			`from theano import config`
			`import theano.tensor as T`
			`from theano.tensor.nnet import categorical_crossentropy`

			`from fuel.datasets import TextFile`
			`from fuel.streams import DataStream`
			`from fuel.schemes import ConstantScheme`
			`from fuel.transformers import Batch, Padding`


			`# These files can be downloaded from`
			`# http://www-etud.iro.umontreal.ca/~brakelp/train.txt.gz`
			`# http://www-etud.iro.umontreal.ca/~brakelp/dictionary.pkl`
			`# don't forget to change the paths and gunzip train.txt.gz`
			`TRAIN_FILE = '/u/brakelp/temp/traindata.txt'`
			`VAL_FILE = '/u/brakelp/temp/valdata.txt'`
			`DICT_FILE = '/u/brakelp/temp/dictionary.pkl'`


			`def sequence_categorical_crossentropy(prediction, targets, mask):`
			`prediction_flat = prediction.reshape(((prediction.shape[0] *`
			`prediction.shape[1]),`
			`prediction.shape[2]), ndim=2)`
			`targets_flat = targets.flatten()`
			`mask_flat = mask.flatten()`
			`ce = categorical_crossentropy(prediction_flat, targets_flat)`
			`return T.sum(ce * mask_flat)`


			`def gauss_weight(ndim_in, ndim_out=None, sd=.005):`
			`if ndim_out is None:`
			`ndim_out = ndim_in`
			`W = numpy.random.randn(ndim_in, ndim_out) * sd`
			`return numpy.asarray(W, dtype=config.floatX)`


			`class LogisticRegression(object):`
			`"""Multi-class Logistic Regression Class`

			The logistic regression is fully described by a weight matrix :math:`W`
			and bias vector :math:`b`. Classification is done by projecting data
			`points onto a set of hyperplanes, the distance to which is used to`
			`determine a class membership probability.`
			`"""`

			`def __init__(self, input, n_in, n_out):`
			`""" Initialize the parameters of the logistic regression`

			`:type input: theano.tensor.TensorType`
			`:param input: symbolic variable that describes the input of the`
			`architecture (one minibatch)`

			`:type n_in: int`
			`:param n_in: number of input units, the dimension of the space in`
			`which the datapoints lie`

			`:type n_out: int`
			`:param n_out: number of output units, the dimension of the space in`
			`which the labels lie`

			`"""`

			`# initialize with 0 the weights W as a matrix of shape (n_in, n_out)`
			`self.W = theano.shared(value=numpy.zeros((n_in, n_out),`
			`dtype=theano.config.floatX),`
			`name='W', borrow=True)`
			`# initialize the baises b as a vector of n_out 0s`
			`self.b = theano.shared(value=numpy.zeros((n_out,),`
			`dtype=theano.config.floatX),`
			`name='b', borrow=True)`

			`# compute vector of class-membership probabilities in symbolic form`
			`energy = T.dot(input, self.W) + self.b`
			`energy_exp = T.exp(energy - T.max(energy, 2)[:, :, None])`
			`pmf = energy_exp / energy_exp.sum(2)[:, :, None]`
			`self.p_y_given_x = pmf`

			`# compute prediction as class whose probability is maximal in`
			`# symbolic form`
			`self.y_pred = T.argmax(self.p_y_given_x, axis=1)`

			`# parameters of the model`
			`self.params = [self.W, self.b]`


			`def index_dot(indices, w):`
			`return w[indices.flatten()]`


			`class LstmLayer:`

			`def __init__(self, rng, input, mask, n_in, n_h):`

			`# Init params`
			`self.W_i = theano.shared(gauss_weight(n_in, n_h), 'W_i', borrow=True)`
			`self.W_f = theano.shared(gauss_weight(n_in, n_h), 'W_f', borrow=True)`
			`self.W_c = theano.shared(gauss_weight(n_in, n_h), 'W_c', borrow=True)`
			`self.W_o = theano.shared(gauss_weight(n_in, n_h), 'W_o', borrow=True)`

			`self.U_i = theano.shared(gauss_weight(n_h), 'U_i', borrow=True)`
			`self.U_f = theano.shared(gauss_weight(n_h), 'U_f', borrow=True)`
			`self.U_c = theano.shared(gauss_weight(n_h), 'U_c', borrow=True)`
			`self.U_o = theano.shared(gauss_weight(n_h), 'U_o', borrow=True)`

			`self.b_i = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),`
			`'b_i', borrow=True)`
			`self.b_f = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),`
			`'b_f', borrow=True)`
			`self.b_c = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),`
			`'b_c', borrow=True)`
			`self.b_o = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),`
			`'b_o', borrow=True)`

			`self.params = [self.W_i, self.W_f, self.W_c, self.W_o,`
			`self.U_i, self.U_f, self.U_c, self.U_o,`
			`self.b_i, self.b_f, self.b_c, self.b_o]`

			`outputs_info = [T.zeros((input.shape[1], n_h)),`
			`T.zeros((input.shape[1], n_h))]`

			`rval, updates = theano.scan(self._step,`
			`sequences=[mask, input],`
			`outputs_info=outputs_info)`

			`# self.output is in the format (batchsize, n_h)`
			`self.output = rval[0]`

			`def _step(self, m_, x_, h_, c_):`

			`i_preact = (index_dot(x_, self.W_i) +`
			`T.dot(h_, self.U_i) + self.b_i)`
			`i = T.nnet.sigmoid(i_preact)`

			`f_preact = (index_dot(x_, self.W_f) +`
			`T.dot(h_, self.U_f) + self.b_f)`
			`f = T.nnet.sigmoid(f_preact)`

			`o_preact = (index_dot(x_, self.W_o) +`
			`T.dot(h_, self.U_o) + self.b_o)`
			`o = T.nnet.sigmoid(o_preact)`

			`c_preact = (index_dot(x_, self.W_c) +`
			`T.dot(h_, self.U_c) + self.b_c)`
			`c = T.tanh(c_preact)`

			`c = f * c_ + i * c`
			`c = m_[:, None] * c + (1. - m_)[:, None] * c_`

			`h = o * T.tanh(c)`
			`h = m_[:, None] * h + (1. - m_)[:, None] * h_`

			`return h, c`


			`def train_model(batch_size=100, n_h=50, n_epochs=40):`

			`# Load the datasets with Fuel`
			`dictionary = pkl.load(open(DICT_FILE, 'r'))`
			`dictionary['~'] = len(dictionary)`
			`reverse_mapping = dict((j, i) for i, j in dictionary.items())`

			`print("Loading the data")`
			`train = TextFile(files=[TRAIN_FILE],`
			`dictionary=dictionary,`
			`unk_token='~',`
			`level='character',`
			`preprocess=str.lower,`
			`bos_token=None,`
			`eos_token=None)`

			`train_stream = DataStream.default_stream(train)`

			`# organize data in batches and pad shorter sequences with zeros`
			`train_stream = Batch(train_stream,`
			`iteration_scheme=ConstantScheme(batch_size))`
			`train_stream = Padding(train_stream)`

			`# idem dito for the validation text`
			`val = TextFile(files=[VAL_FILE],`
			`dictionary=dictionary,`
			`unk_token='~',`
			`level='character',`
			`preprocess=str.lower,`
			`bos_token=None,`
			`eos_token=None)`

			`val_stream = DataStream.default_stream(val)`

			`# organize data in batches and pad shorter sequences with zeros`
			`val_stream = Batch(val_stream,`
			`iteration_scheme=ConstantScheme(batch_size))`
			`val_stream = Padding(val_stream)`

			`print('Building model')`

			`# Set the random number generator' seeds for consistency`
			`rng = numpy.random.RandomState(12345)`

			`x = T.lmatrix('x')`
			`mask = T.matrix('mask')`

			`# Construct the LSTM layer`
			`recurrent_layer = LstmLayer(rng=rng, input=x, mask=mask, n_in=111, n_h=n_h)`

			`logreg_layer = LogisticRegression(input=recurrent_layer.output[:-1],`
			`n_in=n_h, n_out=111)`

			`cost = sequence_categorical_crossentropy(logreg_layer.p_y_given_x,`
			`x[1:],`
			`mask[1:]) / batch_size`

			`# create a list of all model parameters to be fit by gradient descent`
			`params = logreg_layer.params + recurrent_layer.params`

			`# create a list of gradients for all model parameters`
			`grads = T.grad(cost, params)`

			`# update_model is a function that updates the model parameters by`
			`# SGD Since this model has many parameters, it would be tedious to`
			`# manually create an update rule for each model parameter. We thus`
			`# create the updates list by automatically looping over all`
			`# (params[i], grads[i]) pairs.`
			`learning_rate = 0.1`
			`updates = [`
			`(param_i, param_i - learning_rate * grad_i)`
			`for param_i, grad_i in zip(params, grads)`
			`]`

			`update_model = theano.function([x, mask], cost, updates=updates)`

			`evaluate_model = theano.function([x, mask], cost)`

			`# Define and compile a function for generating a sequence step by step.`
			`x_t = T.iscalar()`
			`h_p = T.vector()`
			`c_p = T.vector()`
			`h_t, c_t = recurrent_layer._step(T.ones(1), x_t, h_p, c_p)`
			`energy = T.dot(h_t, logreg_layer.W) + logreg_layer.b`

			`energy_exp = T.exp(energy - T.max(energy, 1)[:, None])`

			`output = energy_exp / energy_exp.sum(1)[:, None]`
			`single_step = theano.function([x_t, h_p, c_p], [output, h_t, c_t])`

			`start_time = time.clock()`

			`iteration = 0`

			`for epoch in range(n_epochs):`
			`print 'epoch:', epoch`

			`for x_, mask_ in train_stream.get_epoch_iterator():`
			`iteration += 1`

			`cross_entropy = update_model(x_.T, mask_.T)`


			`# Generate some text after each 20 minibatches`
			`if iteration % 40 == 0:`
			`try:`
			`prediction = numpy.ones(111, dtype=config.floatX) / 111.0`
			`h_p = numpy.zeros((n_h,), dtype=config.floatX)`
			`c_p = numpy.zeros((n_h,), dtype=config.floatX)`
			`initial = 'the meaning of life is '`
			`sentence = initial`
			`for char in initial:`
			`x_t = dictionary[char]`
			`prediction, h_p, c_p = single_step(x_t, h_p.flatten(),`
			`c_p.flatten())`
			`sample = numpy.random.multinomial(1, prediction.flatten())`
			`for i in range(450):`
			`x_t = numpy.argmax(sample)`
			`prediction, h_p, c_p = single_step(x_t, h_p.flatten(),`
			`c_p.flatten())`
			`sentence += reverse_mapping[x_t]`
			`sample = numpy.random.multinomial(1, prediction.flatten())`
			`print 'LSTM: "' + sentence + '"'`
			`except ValueError:`
			`print 'Something went wrong during sentence generation.'`

			`if iteration % 40 == 0:`
			`print 'epoch:', epoch, ' minibatch:', iteration`
			`val_scores = []`
			`for x_val, mask_val in val_stream.get_epoch_iterator():`
			`val_scores.append(evaluate_model(x_val.T, mask_val.T))`
			`print 'Average validation CE per sentence:', numpy.mean(val_scores)`

			`end_time = time.clock()`
			`print('Optimization complete.')`
			`print('The code ran for %.2fm' % ((end_time - start_time) / 60.))`


			`if __name__ == '__main__':`
			`train_model()`