import cPickle as pkl
import time

import numpy
import theano
from theano import config
import theano.tensor as T
from theano.tensor.nnet import categorical_crossentropy

from fuel.datasets import TextFile
from fuel.streams import DataStream
from fuel.schemes import ConstantScheme
from fuel.transformers import Batch, Padding

# These files can be downloaded from
# http://www-etud.iro.umontreal.ca/~brakelp/train.txt.gz
# http://www-etud.iro.umontreal.ca/~brakelp/dictionary.pkl
# don't forget to change the paths and gunzip train.txt.gz
TRAIN_FILE = '/u/brakelp/temp/traindata.txt'
VAL_FILE = '/u/brakelp/temp/valdata.txt'
DICT_FILE = '/u/brakelp/temp/dictionary.pkl'

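# The cost function below computes a masked, summed cross-entropy over a whole
# minibatch of sequences. As a sketch of the shapes involved (assuming the
# usage later in this script): `prediction` is (seq_len, batch, n_classes),
# while `targets` and `mask` are (seq_len, batch). Flattening the time and
# batch axes gives one softmax row per symbol, and multiplying by the
# flattened mask zeroes out positions that only exist because of padding.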
def sequence_categorical_crossentropy(prediction, targets, mask):
    prediction_flat = prediction.reshape(((prediction.shape[0] *
                                           prediction.shape[1]),
                                          prediction.shape[2]), ndim=2)
    targets_flat = targets.flatten()
    mask_flat = mask.flatten()
    ce = categorical_crossentropy(prediction_flat, targets_flat)
    return T.sum(ce * mask_flat)

def gauss_weight(ndim_in, ndim_out=None, sd=.005):
    if ndim_out is None:
        ndim_out = ndim_in
    W = numpy.random.randn(ndim_in, ndim_out) * sd
    return numpy.asarray(W, dtype=config.floatX)

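# Note that, unlike the classic single-matrix case, the LogisticRegression
# layer below is applied to a 3-D (seq_len, batch, n_h) input: `energy` then
# has shape (seq_len, batch, n_out) and the softmax is taken over the last
# axis, so every time step of every sequence gets its own class distribution.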
class LogisticRegression(object):
    """Multi-class Logistic Regression Class

    The logistic regression is fully described by a weight matrix :math:`W`
    and bias vector :math:`b`. Classification is done by projecting data
    points onto a set of hyperplanes, the distance to which is used to
    determine a class membership probability.
    """

    def __init__(self, input, n_in, n_out):
        """ Initialize the parameters of the logistic regression

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie

        """

        # initialize the weights W as a zero matrix of shape (n_in, n_out)
        self.W = theano.shared(value=numpy.zeros((n_in, n_out),
                                                 dtype=theano.config.floatX),
                               name='W', borrow=True)
        # initialize the biases b as a vector of n_out 0s
        self.b = theano.shared(value=numpy.zeros((n_out,),
                                                 dtype=theano.config.floatX),
                               name='b', borrow=True)

        # compute the class-membership probabilities in symbolic form
        # (a numerically stable softmax over the last axis)
        energy = T.dot(input, self.W) + self.b
        energy_exp = T.exp(energy - T.max(energy, 2)[:, :, None])
        pmf = energy_exp / energy_exp.sum(2)[:, :, None]
        self.p_y_given_x = pmf

        # compute the prediction as the class whose probability is maximal,
        # i.e. the argmax over the last (class) axis
        self.y_pred = T.argmax(self.p_y_given_x, axis=-1)

        # parameters of the model
        self.params = [self.W, self.b]

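# The network input consists of integer character indices rather than one-hot
# vectors, so "multiplying" the input by a weight matrix reduces to selecting
# rows of that matrix. For example (hypothetical values), with w of shape
# (111, n_h) and indices [5, 2], index_dot returns the rows w[5] and w[2],
# which is exactly what multiplying the corresponding one-hot vectors by w
# would produce.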
def index_dot(indices, w):
    return w[indices.flatten()]

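# A sketch of what the LSTM layer below does: each of the W_* matrices maps a
# character index to a hidden-sized vector (via index_dot), the U_* matrices
# connect the previous hidden state to the gates, and theano.scan applies
# _step once per time step over the (seq_len, batch) input. The mask lets
# sequences of different lengths share one padded minibatch.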
class LstmLayer:

    def __init__(self, rng, input, mask, n_in, n_h):

        # Init params
        self.W_i = theano.shared(gauss_weight(n_in, n_h), 'W_i', borrow=True)
        self.W_f = theano.shared(gauss_weight(n_in, n_h), 'W_f', borrow=True)
        self.W_c = theano.shared(gauss_weight(n_in, n_h), 'W_c', borrow=True)
        self.W_o = theano.shared(gauss_weight(n_in, n_h), 'W_o', borrow=True)

        self.U_i = theano.shared(gauss_weight(n_h), 'U_i', borrow=True)
        self.U_f = theano.shared(gauss_weight(n_h), 'U_f', borrow=True)
        self.U_c = theano.shared(gauss_weight(n_h), 'U_c', borrow=True)
        self.U_o = theano.shared(gauss_weight(n_h), 'U_o', borrow=True)

        self.b_i = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),
                                 'b_i', borrow=True)
        self.b_f = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),
                                 'b_f', borrow=True)
        self.b_c = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),
                                 'b_c', borrow=True)
        self.b_o = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),
                                 'b_o', borrow=True)

        self.params = [self.W_i, self.W_f, self.W_c, self.W_o,
                       self.U_i, self.U_f, self.U_c, self.U_o,
                       self.b_i, self.b_f, self.b_c, self.b_o]

        # initial hidden and cell state, one row per sequence in the batch
        outputs_info = [T.zeros((input.shape[1], n_h)),
                        T.zeros((input.shape[1], n_h))]

        rval, updates = theano.scan(self._step,
                                    sequences=[mask, input],
                                    outputs_info=outputs_info)

        # rval[0] stacks the hidden states over time, so self.output has the
        # shape (seq_len, batch_size, n_h)
        self.output = rval[0]
    def _step(self, m_, x_, h_, c_):

        # input gate
        i_preact = (index_dot(x_, self.W_i) +
                    T.dot(h_, self.U_i) + self.b_i)
        i = T.nnet.sigmoid(i_preact)

        # forget gate
        f_preact = (index_dot(x_, self.W_f) +
                    T.dot(h_, self.U_f) + self.b_f)
        f = T.nnet.sigmoid(f_preact)

        # output gate
        o_preact = (index_dot(x_, self.W_o) +
                    T.dot(h_, self.U_o) + self.b_o)
        o = T.nnet.sigmoid(o_preact)

        # candidate cell state
        c_preact = (index_dot(x_, self.W_c) +
                    T.dot(h_, self.U_c) + self.b_c)
        c = T.tanh(c_preact)

        # new cell state; where the mask is 0 (padding) keep the old state
        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        # new hidden state, masked in the same way
        h = o * T.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c

def train_model(batch_size=100, n_h=50, n_epochs=40):

    # Load the datasets with Fuel
    dictionary = pkl.load(open(DICT_FILE, 'rb'))
    # add an entry for the unknown-character token
    dictionary['~'] = len(dictionary)
    reverse_mapping = dict((j, i) for i, j in dictionary.items())

    print("Loading the data")
    train = TextFile(files=[TRAIN_FILE],
                     dictionary=dictionary,
                     unk_token='~',
                     level='character',
                     preprocess=str.lower,
                     bos_token=None,
                     eos_token=None)

    train_stream = DataStream.default_stream(train)

    # organize data in batches and pad shorter sequences with zeros;
    # Padding also adds a mask that marks which positions are real data
    train_stream = Batch(train_stream,
                         iteration_scheme=ConstantScheme(batch_size))
    train_stream = Padding(train_stream)

    # the same preprocessing for the validation text
    val = TextFile(files=[VAL_FILE],
                   dictionary=dictionary,
                   unk_token='~',
                   level='character',
                   preprocess=str.lower,
                   bos_token=None,
                   eos_token=None)

    val_stream = DataStream.default_stream(val)

    # organize data in batches and pad shorter sequences with zeros
    val_stream = Batch(val_stream,
                       iteration_scheme=ConstantScheme(batch_size))
    val_stream = Padding(val_stream)

    print('Building model')

    # Set the random number generator's seed for consistency
    rng = numpy.random.RandomState(12345)

    # x holds a minibatch of character indices, one column per sequence;
    # mask marks which entries are real data rather than padding
    x = T.lmatrix('x')
    mask = T.matrix('mask')

    # Construct the LSTM layer (111 is the number of character classes)
    recurrent_layer = LstmLayer(rng=rng, input=x, mask=mask, n_in=111, n_h=n_h)

    logreg_layer = LogisticRegression(input=recurrent_layer.output[:-1],
                                      n_in=n_h, n_out=111)

    cost = sequence_categorical_crossentropy(logreg_layer.p_y_given_x,
                                             x[1:],
                                             mask[1:]) / batch_size
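    # This is next-character prediction: the hidden states for all but the
    # last time step (recurrent_layer.output[:-1]) are used to predict the
    # input shifted by one position (x[1:]). For example, for the string
    # "cat" the inputs 'c' and 'a' are scored against the targets 'a' and
    # 't'; mask[1:] keeps padded positions out of the cost.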

    # create a list of all model parameters to be fit by gradient descent
    params = logreg_layer.params + recurrent_layer.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # update_model is a function that updates the model parameters by SGD.
    # Since this model has many parameters, it would be tedious to manually
    # create an update rule for each model parameter. We thus create the
    # updates list by automatically looping over all (params[i], grads[i])
    # pairs.
    learning_rate = 0.1
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    update_model = theano.function([x, mask], cost, updates=updates)

    evaluate_model = theano.function([x, mask], cost)

    # Define and compile a function for generating a sequence step by step:
    # the same _step is reused outside of scan with a single character index
    # and the previous hidden and cell state, and the output energies are
    # turned into a distribution over the 111 characters with a softmax.
    x_t = T.iscalar()
    h_p = T.vector()
    c_p = T.vector()
    h_t, c_t = recurrent_layer._step(T.ones(1), x_t, h_p, c_p)
    energy = T.dot(h_t, logreg_layer.W) + logreg_layer.b

    energy_exp = T.exp(energy - T.max(energy, 1)[:, None])

    output = energy_exp / energy_exp.sum(1)[:, None]
    single_step = theano.function([x_t, h_p, c_p], [output, h_t, c_t])

    start_time = time.clock()

    iteration = 0
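    # Training loop. The Fuel streams yield (batch, time) arrays, so both the
    # data and the mask are transposed to (time, batch) before being fed to
    # the scan-based model. Every 40 minibatches a sample text is generated
    # and the validation cross-entropy is reported.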
    for epoch in range(n_epochs):
        print 'epoch:', epoch

        for x_, mask_ in train_stream.get_epoch_iterator():
            iteration += 1

            cross_entropy = update_model(x_.T, mask_.T)

            # Generate some text every 40 minibatches
            if iteration % 40 == 0:
                try:
                    # start from a uniform distribution over the 111 characters
                    prediction = numpy.ones(111, dtype=config.floatX) / 111.0
                    h_p = numpy.zeros((n_h,), dtype=config.floatX)
                    c_p = numpy.zeros((n_h,), dtype=config.floatX)
                    initial = 'the meaning of life is '
                    sentence = initial
                    # feed the seed phrase through the network one character
                    # at a time to build up the hidden and cell state
                    for char in initial:
                        x_t = dictionary[char]
                        prediction, h_p, c_p = single_step(x_t, h_p.flatten(),
                                                           c_p.flatten())
                    sample = numpy.random.multinomial(1, prediction.flatten())
                    # then sample 450 characters, feeding each sampled
                    # character back in as the next input
                    for i in range(450):
                        x_t = numpy.argmax(sample)
                        prediction, h_p, c_p = single_step(x_t, h_p.flatten(),
                                                           c_p.flatten())
                        sentence += reverse_mapping[x_t]
                        sample = numpy.random.multinomial(1, prediction.flatten())
                    print 'LSTM: "' + sentence + '"'
                except ValueError:
                    print 'Something went wrong during sentence generation.'

            if iteration % 40 == 0:
                print 'epoch:', epoch, ' minibatch:', iteration
                val_scores = []
                for x_val, mask_val in val_stream.get_epoch_iterator():
                    val_scores.append(evaluate_model(x_val.T, mask_val.T))
                print 'Average validation CE per sentence:', numpy.mean(val_scores)

    end_time = time.clock()
    print('Optimization complete.')
    print('The code ran for %.2fm' % ((end_time - start_time) / 60.))


if __name__ == '__main__':
    train_model()