diff --git a/.gitignore b/.gitignore
index e3d8857a855..0b2f7966558 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,8 +6,13 @@ advanced
 #data things
 beginner_source/hymenoptera_data
 intermediate_source/data/
+advanced_source/images/
 *data.zip
 
+#builds
+_build/
+_static/thumbs/
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/beginner_source/deep_learning_nlp_tutorial.py b/beginner_source/deep_learning_nlp_tutorial.py
deleted file mode 100644
index aecc9c0ba89..00000000000
--- a/beginner_source/deep_learning_nlp_tutorial.py
+++ /dev/null
@@ -1,1643 +0,0 @@
# -*- coding: utf-8 -*-
"""
Deep Learning for Natural Language Processing with Pytorch
**********************************************************
**Author**: `Robert Guthrie `_

This tutorial will walk you through the key ideas of deep learning
programming using Pytorch. Many of the concepts (such as the computation
graph abstraction and autograd) are not unique to Pytorch and are
relevant to any deep learning toolkit out there.

I am writing this tutorial to focus specifically on NLP for people who
have never written code in any deep learning framework (e.g., TensorFlow,
Theano, Keras, Dynet). It assumes working knowledge of core NLP
problems: part-of-speech tagging, language modeling, etc. It also
assumes familiarity with neural networks at the level of an intro AI
class (such as one from the Russell and Norvig book). Usually, these
courses cover the basic backpropagation algorithm on feed-forward neural
networks, and make the point that they are chains of compositions of
linearities and non-linearities. This tutorial aims to get you started
writing deep learning code, given you have this prerequisite knowledge.

Note this is about *models*, not data. For all of the models, I just
create a few test examples with small dimensionality so you can see how
the weights change as it trains. If you have some real data you want to
try, you should be able to rip out any of the models from this notebook
and use them on it.

"""

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)


######################################################################
# 1. Introduction to Torch's tensor library
# =========================================
#


######################################################################
# All of deep learning is computations on tensors, which are
# generalizations of a matrix that can be indexed in more than 2
# dimensions. We will see exactly what this means in-depth later. First,
# let's look at what we can do with tensors.
#


######################################################################
# Creating Tensors
# ~~~~~~~~~~~~~~~~
#
# Tensors can be created from Python lists with the torch.Tensor()
# function.
#

# Create a torch.Tensor object with the given data. It is a 1D vector
V_data = [1., 2., 3.]
V = torch.Tensor(V_data)
print(V)

# Creates a matrix
M_data = [[1., 2., 3.], [4., 5., 6]]
M = torch.Tensor(M_data)
print(M)

# Create a 3D tensor of size 2x2x2.
T_data = [[[1., 2.], [3., 4.]],
          [[5., 6.], [7., 8.]]]
T = torch.Tensor(T_data)
print(T)


######################################################################
# What is a 3D tensor anyway? Think about it like this. If you have a
# vector, indexing into the vector gives you a scalar. If you have a
# matrix, indexing into the matrix gives you a vector. If you have a 3D
# tensor, then indexing into the tensor gives you a matrix!
#
# A note on terminology:
# when I say "tensor" in this tutorial, it refers
# to any torch.Tensor object. Matrices and vectors are special cases of
# torch.Tensors, where their dimension is 2 and 1 respectively. When I am
# talking about 3D tensors, I will explicitly use the term "3D tensor".
#

# Index into V and get a scalar
print(V[0])

# Index into M and get a vector
print(M[0])

# Index into T and get a matrix
print(T[0])


######################################################################
# You can also create tensors of other datatypes. The default, as you can
# see, is Float. To create a tensor of integer types, try
# torch.LongTensor(). Check the documentation for more data types, but
# Float and Long will be the most common.
#


######################################################################
# You can create a tensor with random data and the supplied dimensionality
# with torch.randn()
#

x = torch.randn((3, 4, 5))
print(x)


######################################################################
# Operations with Tensors
# ~~~~~~~~~~~~~~~~~~~~~~~
#
# You can operate on tensors in the ways you would expect.

x = torch.Tensor([1., 2., 3.])
y = torch.Tensor([4., 5., 6.])
z = x + y
print(z)


######################################################################
# See `the documentation `__ for a
# complete list of the massive number of operations available to you. They
# expand beyond just mathematical operations.
#
# One helpful operation that we will make use of later is concatenation.
#

# By default, it concatenates along the first axis (concatenates rows)
x_1 = torch.randn(2, 5)
y_1 = torch.randn(3, 5)
z_1 = torch.cat([x_1, y_1])
print(z_1)

# Concatenate columns:
x_2 = torch.randn(2, 3)
y_2 = torch.randn(2, 5)
# second arg specifies which axis to concat along
z_2 = torch.cat([x_2, y_2], 1)
print(z_2)

# If your tensors are not compatible, torch will complain. Uncomment to see the error
# torch.cat([x_1, x_2])


######################################################################
# Reshaping Tensors
# ~~~~~~~~~~~~~~~~~
#
# Use the .view() method to reshape a tensor. This method receives heavy
# use, because many neural network components expect their inputs to have
# a certain shape. Often you will need to reshape before passing your data
# to the component.
#

x = torch.randn(2, 3, 4)
print(x)
print(x.view(2, 12))  # Reshape to 2 rows, 12 columns
# Same as above. If one of the dimensions is -1, its size can be inferred
print(x.view(2, -1))


######################################################################
# 2. Computation Graphs and Automatic Differentiation
# ===================================================
#


######################################################################
# The concept of a computation graph is essential to efficient deep
# learning programming, because it allows you to avoid writing the
# backpropagation gradients yourself. A computation graph is simply a
# specification of how your data is combined to give you the output. Since
# the graph totally specifies what parameters were involved with which
# operations, it contains enough information to compute derivatives. This
# probably sounds vague, so let's see what is going on using the
# fundamental class of Pytorch: autograd.Variable.
#
# First, think from a programmer's perspective. What is stored in the
# torch.Tensor objects we were creating above? Obviously the data and the
# shape, and maybe a few other things. But when we added two tensors
# together, we got an output tensor. All this output tensor knows is its
# data and shape. It has no idea that it was the sum of two other tensors
# (it could have been read in from a file, it could be the result of some
# other operation, etc.)
#
# The Variable class keeps track of how it was created. Let's see it in
# action.
#

# Variables wrap tensor objects
x = autograd.Variable(torch.Tensor([1., 2., 3]), requires_grad=True)
# You can access the data with the .data attribute
print(x.data)

# You can also do all the same operations you did with tensors with Variables.
y = autograd.Variable(torch.Tensor([4., 5., 6]), requires_grad=True)
z = x + y
print(z.data)

# BUT z knows something extra.
print(z.creator)


######################################################################
# So Variables know what created them. z knows that it wasn't read in from
# a file, it wasn't the result of a multiplication or exponential or
# whatever. And if you keep following z.creator, you will find yourself at
# x and y.
#
# But how does that help us compute a gradient?
#

# Let's sum up all the entries in z
s = z.sum()
print(s)
print(s.creator)


######################################################################
# So now, what is the derivative of this sum with respect to the first
# component of x? In math, we want
#
# .. math::
#
#    \frac{\partial s}{\partial x_0}
#
# Well, s knows that it was created as a sum of the tensor z. z knows
# that it was the sum x + y. So
#
# .. math:: s = \overbrace{x_0 + y_0}^\text{$z_0$} + \overbrace{x_1 + y_1}^\text{$z_1$} + \overbrace{x_2 + y_2}^\text{$z_2$}
#
# And so s contains enough information to determine that the derivative
# we want is 1!
#
# Of course this glosses over the challenge of how to actually compute
# that derivative. The point here is that s is carrying along enough
# information that it is possible to compute it. In reality, the
# developers of Pytorch program the sum() and + operations to know how to
# compute their gradients, and run the backpropagation algorithm. An
# in-depth discussion of that algorithm is beyond the scope of this
# tutorial.
#


######################################################################
# Let's have Pytorch compute the gradient, and see that we were right:
# (note if you run this block multiple times, the gradient will increment.
# That is because Pytorch *accumulates* the gradient into the .grad
# property, since for many models this is very convenient.)
#

# calling .backward() on any variable will run backprop, starting from it.
s.backward()
print(x.grad)


######################################################################
# Understanding what is going on in the block below is crucial for being a
# successful programmer in deep learning.
#

x = torch.randn((2, 2))
y = torch.randn((2, 2))
z = x + y  # These are Tensor types, and backprop would not be possible

var_x = autograd.Variable(x)
var_y = autograd.Variable(y)
# var_z contains enough information to compute gradients, as we saw above
var_z = var_x + var_y
print(var_z.creator)

var_z_data = var_z.data  # Get the wrapped Tensor object out of var_z...
# Re-wrap the tensor in a new variable
new_var_z = autograd.Variable(var_z_data)
# ... does new_var_z have information to backprop to x and y?
# NO!
print(new_var_z.creator)
# And how could it? We yanked the tensor out of var_z (that is
# what var_z.data is). This tensor doesn't know anything about
# how it was computed. We pass it into new_var_z, and this is all the
# information new_var_z gets. If var_z_data doesn't know how it was
# computed, there's no way new_var_z will.
# In essence, we have broken the variable away from its past history


######################################################################
# Here is the basic, extremely important rule for computing with
# autograd.Variables (note this is more general than Pytorch. There is an
# equivalent object in every major deep learning toolkit):
#
# **If you want the error from your loss function to backpropagate to a
# component of your network, you MUST NOT break the Variable chain from
# that component to your loss Variable. If you do, the loss will have no
# idea your component exists, and its parameters can't be updated.**
#
# I say this in bold, because this error can creep up on you in very
# subtle ways (I will show some such ways below), and it will not cause
# your code to crash or complain, so you must be careful.
#


######################################################################
# 3. Deep Learning Building Blocks: Affine maps, non-linearities and objectives
# =============================================================================
#


######################################################################
# Deep learning consists of composing linearities with non-linearities in
# clever ways. The introduction of non-linearities allows for powerful
# models. In this section, we will play with these core components, make
# up an objective function, and see how the model is trained.
#


######################################################################
# Affine Maps
# ~~~~~~~~~~~
#
# One of the core workhorses of deep learning is the affine map, which is
# a function :math:`f(x)` where
#
# .. math:: f(x) = Ax + b
#
# for a matrix :math:`A` and vectors :math:`x, b`. The parameters to be
# learned here are :math:`A` and :math:`b`. Often, :math:`b` is referred to
# as the *bias* term.
#


######################################################################
# Pytorch and most other deep learning frameworks do things a little
# differently than traditional linear algebra. They map the rows of the
# input instead of the columns. That is, the :math:`i`'th row of the
# output below is the mapping of the :math:`i`'th row of the input under
# :math:`A`, plus the bias term. Look at the example below.
#

lin = nn.Linear(5, 3)  # maps from R^5 to R^3, parameters A, b
# data is 2x5. A maps from 5 to 3... can we map "data" under A?
data = autograd.Variable(torch.randn(2, 5))
print(lin(data))  # yes


######################################################################
# Non-Linearities
# ~~~~~~~~~~~~~~~
#
# First, note the following fact, which will explain why we need
# non-linearities in the first place. Suppose we have two affine maps
# :math:`f(x) = Ax + b` and :math:`g(x) = Cx + d`. What is
# :math:`f(g(x))`?
#
# .. math:: f(g(x)) = A(Cx + d) + b = ACx + (Ad + b)
#
# :math:`AC` is a matrix and :math:`Ad + b` is a vector, so we see that
# composing affine maps gives you an affine map.
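
######################################################################
# To make that fact concrete, here is a quick numerical check (a sketch
# of our own; the names f and g are arbitrary): we compose two nn.Linear
# maps and rebuild the same function as a single affine map with weight
# :math:`AC` and bias :math:`Ad + b`.
#

f = nn.Linear(4, 4)  # f(x) = Ax + b
g = nn.Linear(4, 4)  # g(x) = Cx + d
x = autograd.Variable(torch.randn(1, 4))

composed = f(g(x))
# nn.Linear stores A with shape (out, in), so the matrix products line up:
AC = f.weight.mm(g.weight)            # combined matrix
Ad_b = f.weight.mv(g.bias) + f.bias   # combined bias
single = x.mm(AC.t()) + Ad_b.view(1, -1)
print(composed - single)  # ~0, up to floating point error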
######################################################################
# From this, you can see that if you wanted your neural network to be long
# chains of affine compositions, this adds no new power to your model
# over just doing a single affine map.
#
# If we introduce non-linearities in between the affine layers, this is no
# longer the case, and we can build much more powerful models.
#
# There are a few core non-linearities.
# :math:`\tanh(x), \sigma(x), \text{ReLU}(x)` are the most common. You are
# probably wondering: "why these functions? I can think of plenty of other
# non-linearities." The reason for this is that they have gradients that
# are easy to compute, and computing gradients is essential for learning.
# For example
#
# .. math:: \frac{d\sigma}{dx} = \sigma(x)(1 - \sigma(x))
#
# A quick note: although you may have learned some neural networks in your
# intro to AI class where :math:`\sigma(x)` was the default non-linearity,
# typically people shy away from it in practice. This is because the
# gradient *vanishes* very quickly as the absolute value of the argument
# grows. Small gradients mean it is hard to learn. Most people default to
# tanh or ReLU.
#

# In pytorch, most non-linearities are in torch.nn.functional (we have it imported as F)
# Note that non-linearities typically don't have parameters like affine maps do.
# That is, they don't have weights that are updated during training.
data = autograd.Variable(torch.randn(2, 2))
print(data)
print(F.relu(data))


######################################################################
# Softmax and Probabilities
# ~~~~~~~~~~~~~~~~~~~~~~~~~
#
# The function :math:`\text{Softmax}(x)` is also just a non-linearity, but
# it is special in that it usually is the last operation done in a
# network. This is because it takes in a vector of real numbers and
# returns a probability distribution. Its definition is as follows. Let
# :math:`x` be a vector of real numbers (positive, negative, whatever,
# there are no constraints). Then the i'th component of
# :math:`\text{Softmax}(x)` is
#
# .. math:: \frac{\exp(x_i)}{\sum_j \exp(x_j)}
#
# It should be clear that the output is a probability distribution: each
# element is non-negative and the sum over all components is 1.
#
# You could also think of it as just applying an element-wise
# exponentiation operator to the input to make everything non-negative and
# then dividing by the normalization constant.
#

# Softmax is also in torch.nn.functional
data = autograd.Variable(torch.randn(5))
print(data)
print(F.softmax(data))
print(F.softmax(data).sum())  # Sums to 1 because it is a distribution!
print(F.log_softmax(data))  # there's also log_softmax


######################################################################
# Objective Functions
# ~~~~~~~~~~~~~~~~~~~
#
# The objective function is the function that your network is being
# trained to minimize (in which case it is often called a *loss function*
# or *cost function*). This proceeds by first choosing a training
# instance, running it through your neural network, and then computing the
# loss of the output. The parameters of the model are then updated by
# taking the derivative of the loss function. Intuitively, if your model
# is completely confident in its answer, and its answer is wrong, your
# loss will be high. If it is very confident in its answer, and its answer
# is correct, the loss will be low.
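
######################################################################
# To make that concrete, here is a small sketch (toy probabilities of our
# own) using the negative log likelihood loss discussed next: a confident
# wrong answer costs far more than a confident right one.
#

# Log probabilities for two hypothetical 2-class predictions
confident_right = autograd.Variable(torch.log(torch.Tensor([[0.98, 0.02]])))
confident_wrong = autograd.Variable(torch.log(torch.Tensor([[0.02, 0.98]])))
target = autograd.Variable(torch.LongTensor([0]))  # the true label is class 0
loss_fn = nn.NLLLoss()
print(loss_fn(confident_right, target))  # small loss (~0.02)
print(loss_fn(confident_wrong, target))  # large loss (~3.9)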
######################################################################
# The idea behind minimizing the loss function on your training examples
# is that your network will hopefully generalize well and have small loss
# on unseen examples in your dev set, test set, or in production. An
# example loss function is the *negative log likelihood loss*, which is a
# very common objective for multi-class classification. For supervised
# multi-class classification, this means training the network to minimize
# the negative log probability of the correct output (or equivalently,
# maximize the log probability of the correct output).
#


######################################################################
# 4. Optimization and Training
# ============================
#


######################################################################
# So suppose we can compute a loss function for an instance. What do we do
# with that? We saw earlier that autograd.Variables know how to compute
# gradients with respect to the things that were used to compute them. Well,
# since our loss is an autograd.Variable, we can compute gradients with
# respect to all of the parameters used to compute it! Then we can perform
# standard gradient updates. Let :math:`\theta` be our parameters,
# :math:`L(\theta)` the loss function, and :math:`\eta` a positive
# learning rate. Then:
#
# .. math:: \theta^{(t+1)} = \theta^{(t)} - \eta \nabla_\theta L(\theta)
#
# There is a huge collection of algorithms and active research in
# attempting to do something more than just this vanilla gradient update.
# Many attempt to vary the learning rate based on what is happening at
# train time. You don't need to worry about what specifically these
# algorithms are doing unless you are really interested. Torch provides
# many in the torch.optim package, and they are all completely
# transparent. Using the simplest gradient update is the same as the more
# complicated algorithms. Trying different update algorithms and different
# parameters for the update algorithms (like different initial learning
# rates) is important in optimizing your network's performance. Often,
# just replacing vanilla SGD with an optimizer like Adam or RMSProp will
# boost performance noticeably.
#


######################################################################
# 5. Creating Network Components in Pytorch
# =========================================
#
# Before we move on to our focus on NLP, let's do an annotated example of
# building a network in Pytorch using only affine maps and
# non-linearities. We will also see how to compute a loss function, using
# Pytorch's built-in negative log likelihood, and update parameters by
# backpropagation.
#
# All network components should inherit from nn.Module and override the
# forward() method. That is about it, as far as the boilerplate is
# concerned. Inheriting from nn.Module provides functionality to your
# component. For example, it makes it keep track of its trainable
# parameters, you can swap it between CPU and GPU with the .cuda() or
# .cpu() functions, etc.
#
# Let's write an annotated example of a network that takes in a sparse
# bag-of-words representation and outputs a probability distribution over
# two labels: "English" and "Spanish". This model is just logistic
# regression.
#
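
######################################################################
# Before the worked example, here is a small sketch (toy layer and
# learning rate of our own) of what the vanilla gradient update above
# looks like written out by hand. optim.SGD does exactly this step for
# you.
#

lin = nn.Linear(5, 3)
x = autograd.Variable(torch.randn(2, 5))
loss = lin(x).pow(2).sum()  # a stand-in "loss", just to get gradients
loss.backward()
eta = 0.1  # learning rate
for param in lin.parameters():
    # theta^(t+1) = theta^(t) - eta * gradient
    # (.grad is a Variable wrapping the gradient tensor here)
    param.data -= eta * param.grad.data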
######################################################################
# Example: Logistic Regression Bag-of-Words classifier
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Our model will map a sparse BOW representation to log probabilities over
# labels. We assign each word in the vocab an index. For example, say our
# entire vocab is two words "hello" and "world", with indices 0 and 1
# respectively. The BoW vector for the sentence "hello hello hello hello"
# is
#
# .. math:: \left[ 4, 0 \right]
#
# For "hello world world hello", it is
#
# .. math:: \left[ 2, 2 \right]
#
# etc. In general, it is
#
# .. math:: \left[ \text{Count}(\text{hello}), \text{Count}(\text{world}) \right]
#
# Denote this BOW vector as :math:`x`. The output of our network is:
#
# .. math:: \log \text{Softmax}(Ax + b)
#
# That is, we pass the input through an affine map and then do log
# softmax.
#

data = [("me gusta comer en la cafeteria".split(), "SPANISH"),
        ("Give it to me".split(), "ENGLISH"),
        ("No creo que sea una buena idea".split(), "SPANISH"),
        ("No it is not a good idea to get lost at sea".split(), "ENGLISH")]

test_data = [("Yo creo que si".split(), "SPANISH"),
             ("it is lost on me".split(), "ENGLISH")]

# word_to_ix maps each word in the vocab to a unique integer, which will be its
# index into the Bag of words vector
word_to_ix = {}
for sent, _ in data + test_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
print(word_to_ix)

VOCAB_SIZE = len(word_to_ix)
NUM_LABELS = 2


class BoWClassifier(nn.Module):  # inheriting from nn.Module!

    def __init__(self, num_labels, vocab_size):
        # calls the init function of nn.Module. Don't get confused by syntax,
        # just always do it in an nn.Module
        super(BoWClassifier, self).__init__()

        # Define the parameters that you will need. In this case, we need A and b,
        # the parameters of the affine mapping.
        # Torch defines nn.Linear(), which provides the affine map.
        # Make sure you understand why the input dimension is vocab_size
        # and the output is num_labels!
        self.linear = nn.Linear(vocab_size, num_labels)

        # NOTE! The non-linearity log softmax does not have parameters! So we don't need
        # to worry about that here

    def forward(self, bow_vec):
        # Pass the input through the linear layer,
        # then pass that through log_softmax.
        # Many non-linearities and other functions are in torch.nn.functional
        return F.log_softmax(self.linear(bow_vec))


def make_bow_vector(sentence, word_to_ix):
    vec = torch.zeros(len(word_to_ix))
    for word in sentence:
        vec[word_to_ix[word]] += 1
    return vec.view(1, -1)


def make_target(label, label_to_ix):
    return torch.LongTensor([label_to_ix[label]])

model = BoWClassifier(NUM_LABELS, VOCAB_SIZE)

# the model knows its parameters. The first output below is A, the second is b.
# Whenever you assign a component to a class variable in the __init__ function
# of a module, which was done with the line
# self.linear = nn.Linear(...)
# Then through some Python magic from the Pytorch devs, your module
# (in this case, BoWClassifier) will store knowledge of the nn.Linear's parameters
for param in model.parameters():
    print(param)

# To run the model, pass in a BoW vector, but wrapped in an autograd.Variable
sample = data[0]
bow_vector = make_bow_vector(sample[0], word_to_ix)
log_probs = model(autograd.Variable(bow_vector))
print(log_probs)


######################################################################
# Which of the above values corresponds to the log probability of ENGLISH,
# and which to SPANISH? We never defined it, but we need to if we want to
# train the thing.
#

label_to_ix = {"SPANISH": 0, "ENGLISH": 1}


######################################################################
# So let's train! To do this, we pass instances through to get log
# probabilities, compute a loss function, compute the gradient of the loss
# function, and then update the parameters with a gradient step. Loss
# functions are provided by Torch in the nn package. nn.NLLLoss() is the
# negative log likelihood loss we want. Torch also defines optimization
# functions in torch.optim. Here, we will just use SGD.
#
# Note that the *input* to NLLLoss is a vector of log probabilities, and a
# target label. It doesn't compute the log probabilities for us. This is
# why the last layer of our network is log softmax. The loss function
# nn.CrossEntropyLoss() is the same as NLLLoss(), except it does the log
# softmax for you.
#

# Run on test data before we train, just to see a before-and-after
for instance, label in test_data:
    bow_vec = autograd.Variable(make_bow_vector(instance, word_to_ix))
    log_probs = model(bow_vec)
    print(log_probs)

# Print the matrix column corresponding to "creo"
print(next(model.parameters())[:, word_to_ix["creo"]])

loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Usually you want to pass over the training data several times.
# 100 is much bigger than on a real data set, but real datasets have more than
# two instances. Usually, somewhere between 5 and 30 epochs is reasonable.
for epoch in range(100):
    for instance, label in data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Make our BOW vector and also we must wrap the target in a
        # Variable as an integer. For example, if the target is SPANISH, then
        # we wrap the integer 0. The loss function then knows that the 0th
        # element of the log probabilities is the log probability
        # corresponding to SPANISH
        bow_vec = autograd.Variable(make_bow_vector(instance, word_to_ix))
        target = autograd.Variable(make_target(label, label_to_ix))

        # Step 3. Run our forward pass.
        log_probs = model(bow_vec)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()

for instance, label in test_data:
    bow_vec = autograd.Variable(make_bow_vector(instance, word_to_ix))
    log_probs = model(bow_vec)
    print(log_probs)

# Index corresponding to Spanish goes up, English goes down!
print(next(model.parameters())[:, word_to_ix["creo"]])


######################################################################
# We got the right answer! You can see that the log probability for
# Spanish is much higher in the first example, and the log probability for
# English is much higher in the second for the test data, as it should be.
#
# Now you see how to make a Pytorch component, pass some data through it
# and do gradient updates. We are ready to dig deeper into what deep NLP
# has to offer.
#


######################################################################
# 6. Word Embeddings: Encoding Lexical Semantics
# ==============================================
#


######################################################################
# Word embeddings are dense vectors of real numbers, one per word in your
# vocabulary. In NLP, it is almost always the case that your features are
# words! But how should you represent a word in a computer? You could
# store its ASCII character representation, but that only tells you what
# the word *is*, it doesn't say much about what it *means* (you might be
# able to derive its part of speech from its affixes, or properties from
# its capitalization, but not much). Even more, in what sense could you
# combine these representations? We often want dense outputs from our
# neural networks, where the inputs are :math:`|V|` dimensional, where
# :math:`V` is our vocabulary, but often the outputs are only a few
# dimensional (if we are only predicting a handful of labels, for
# instance). How do we get from a massive dimensional space to a smaller
# dimensional space?
#
# How about instead of ASCII representations, we use a one-hot encoding?
# That is, we represent the word :math:`w` by
#
# .. math:: \overbrace{\left[ 0, 0, \dots, 1, \dots, 0, 0 \right]}^\text{|V| elements}
#
# where the 1 is in a location unique to :math:`w`. Any other word will
# have a 1 in some other location, and a 0 everywhere else.
#
# There is an enormous drawback to this representation, besides just how
# huge it is. It basically treats all words as independent entities with
# no relation to each other. What we really want is some notion of
# *similarity* between words. Why? Let's see an example.
#


######################################################################
# Suppose we are building a language model. Suppose we have seen the
# sentences
#
# * The mathematician ran to the store.
# * The physicist ran to the store.
# * The mathematician solved the open problem.
#
# in our training data. Now suppose we get a new sentence never before
# seen in our training data:
#
# * The physicist solved the open problem.
#
# Our language model might do OK on this sentence, but wouldn't it be much
# better if we could use the following two facts:
#
# * We have seen mathematician and physicist in the same role in a sentence. Somehow they
#   have a semantic relation.
# * We have seen mathematician in the same role in this new unseen sentence
#   as we are now seeing physicist.
#
# and then infer that physicist is actually a good fit in the new unseen
# sentence? This is what we mean by a notion of similarity: we mean
# *semantic similarity*, not simply having similar orthographic
# representations. It is a technique to combat the sparsity of linguistic
# data, by connecting the dots between what we have seen and what we
# haven't. This example of course relies on a fundamental linguistic
# assumption: that words appearing in similar contexts are related to each
# other semantically. This is called the `distributional
# hypothesis `__.
#


######################################################################
# Getting Dense Word Embeddings
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# How can we solve this problem? That is, how could we actually encode
# semantic similarity in words? Maybe we think up some semantic
# attributes. For example, we see that both mathematicians and physicists
# can run, so maybe we give these words a high score for the "is able to
# run" semantic attribute. Think of some other attributes, and imagine
# what you might score some common words on those attributes.
#
# If each attribute is a dimension, then we might give each word a vector,
# like this:
#
# .. math::
#
#    q_\text{mathematician} = \left[ \overbrace{2.3}^\text{can run},
#    \overbrace{9.4}^\text{likes coffee}, \overbrace{-5.5}^\text{majored in Physics}, \dots \right]
#
# .. math::
#
#    q_\text{physicist} = \left[ \overbrace{2.5}^\text{can run},
#    \overbrace{9.1}^\text{likes coffee}, \overbrace{6.4}^\text{majored in Physics}, \dots \right]
#
# Then we can get a measure of similarity between these words by doing:
#
# .. math:: \text{Similarity}(\text{physicist}, \text{mathematician}) = q_\text{physicist} \cdot q_\text{mathematician}
#
# Although it is more common to normalize by the lengths:
#
# .. math::
#
#    \text{Similarity}(\text{physicist}, \text{mathematician}) = \frac{q_\text{physicist} \cdot q_\text{mathematician}}
#    {\| q_\text{physicist} \| \| q_\text{mathematician} \|} = \cos(\phi)
#
# where :math:`\phi` is the angle between the two vectors. That way,
# extremely similar words (words whose embeddings point in the same
# direction) will have similarity 1. Extremely dissimilar words should
# have similarity -1.
#


######################################################################
# You can think of the sparse one-hot vectors from the beginning of this
# section as a special case of these new vectors we have defined, where
# each word basically has similarity 0, and we gave each word some unique
# semantic attribute. These new vectors are *dense*, which is to say their
# entries are (typically) non-zero.
#
# But these new vectors are a big pain: you could think of thousands of
# different semantic attributes that might be relevant to determining
# similarity, and how on earth would you set the values of the different
# attributes? Central to the idea of deep learning is that the neural
# network learns representations of the features, rather than requiring
# the programmer to design them herself. So why not just let the word
# embeddings be parameters in our model, and then be updated during
# training? This is exactly what we will do. We will have some *latent
# semantic attributes* that the network can, in principle, learn. Note
# that the word embeddings will probably not be interpretable. That is,
# although with our hand-crafted vectors above we can see that
# mathematicians and physicists are similar in that they both like coffee,
# if we allow a neural network to learn the embeddings and see that both
# mathematicians and physicists have a large value in the second
# dimension, it is not clear what that means. They are similar in some
# latent semantic dimension, but this probably has no interpretation to
# us.
#
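
######################################################################
# As a quick sketch (reusing the toy attribute vectors from above), the
# cosine similarity formula is one line in Torch:
#

q_mathematician = torch.Tensor([2.3, 9.4, -5.5])
q_physicist = torch.Tensor([2.5, 9.1, 6.4])
similarity = q_mathematician.dot(q_physicist) / \
    (q_mathematician.norm() * q_physicist.norm())
print(similarity)  # a value in [-1, 1]; these two come out moderately similar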
######################################################################
# In summary, **word embeddings are a representation of the *semantics* of
# a word, efficiently encoding semantic information that might be relevant
# to the task at hand**. You can embed other things too: part of speech
# tags, parse trees, anything! The idea of feature embeddings is central
# to the field.
#


######################################################################
# Word Embeddings in Pytorch
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Before we get to a worked example and an exercise, a few quick notes
# about how to use embeddings in Pytorch and in deep learning programming
# in general. Similar to how we defined a unique index for each word when
# making one-hot vectors, we also need to define an index for each word
# when using embeddings. These will be keys into a lookup table. That is,
# embeddings are stored as a :math:`|V| \times D` matrix, where :math:`D`
# is the dimensionality of the embeddings, such that the word assigned
# index :math:`i` has its embedding stored in the :math:`i`'th row of the
# matrix. In all of my code, the mapping from words to indices is a
# dictionary named word\_to\_ix.
#
# The module that allows you to use embeddings is torch.nn.Embedding,
# which takes two arguments: the vocabulary size, and the dimensionality
# of the embeddings.
#
# To index into this table, you must use torch.LongTensor (since the
# indices are integers, not floats).
#

word_to_ix = {"hello": 0, "world": 1}
embeds = nn.Embedding(2, 5)  # 2 words in vocab, 5 dimensional embeddings
lookup_tensor = torch.LongTensor([word_to_ix["hello"]])
hello_embed = embeds(autograd.Variable(lookup_tensor))
print(hello_embed)


######################################################################
# An Example: N-Gram Language Modeling
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Recall that in an n-gram language model, given a sequence of words
# :math:`w`, we want to compute
#
# .. math:: P(w_i | w_{i-1}, w_{i-2}, \dots, w_{i-n+1})
#
# where :math:`w_i` is the ith word of the sequence.
#
# In this example, we will compute the loss function on some training
# examples and update the parameters with backpropagation.
#

CONTEXT_SIZE = 2
EMBEDDING_DIM = 10
# We will use Shakespeare Sonnet 2
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()
# we should tokenize the input, but we will ignore that for now
# build a list of tuples. Each tuple is ([ word_i-2, word_i-1 ], target word)
trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
            for i in range(len(test_sentence) - 2)]
# print the first 3, just so you can see what they look like
print(trigrams[:3])

vocab = set(test_sentence)
word_to_ix = {word: i for i, word in enumerate(vocab)}


class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out)
        return log_probs

losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(10):
    total_loss = torch.Tensor([0])
    for context, target in trigrams:

        # Step 1. Prepare the inputs to be passed to the model (i.e., turn the words
        # into integer indices and wrap them in variables)
        context_idxs = [word_to_ix[w] for w in context]
        context_var = autograd.Variable(torch.LongTensor(context_idxs))

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a new instance,
        # you need to zero out the gradients from the old instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_var)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a variable)
        loss = loss_function(log_probs, autograd.Variable(
            torch.LongTensor([word_to_ix[target]])))

        # Step 5. Do the backward pass and update the parameters
        loss.backward()
        optimizer.step()

        total_loss += loss.data
    losses.append(total_loss)
print(losses)  # The loss decreased every iteration over the training data!


######################################################################
# Exercise: Computing Word Embeddings: Continuous Bag-of-Words
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#


######################################################################
# The Continuous Bag-of-Words model (CBOW) is frequently used in NLP deep
# learning. It is a model that tries to predict words given the context of
# a few words before and a few words after the target word. This is
# distinct from language modeling, since CBOW is not sequential and does
# not have to be probabilistic. Typically, CBOW is used to quickly train
# word embeddings, and these embeddings are used to initialize the
# embeddings of some more complicated model. Usually, this is referred to
# as *pretraining embeddings*. It almost always helps performance a couple
# of percent.
#
# The CBOW model is as follows. Given a target word :math:`w_i` and an
# :math:`N` context window on each side, :math:`w_{i-1}, \dots, w_{i-N}`
# and :math:`w_{i+1}, \dots, w_{i+N}`, referring to all context words
# collectively as :math:`C`, CBOW tries to minimize
#
# .. math:: -\log p(w_i | C) = -\log \text{Softmax}(A(\sum_{w \in C} q_w) + b)
#
# where :math:`q_w` is the embedding for word :math:`w`.
#
# Implement this model in Pytorch by filling in the class below. Some
# tips:
#
# * Think about which parameters you need to define.
# * Make sure you know what shape each operation expects. Use .view() if you need to
#   reshape.
#

CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()
vocab = set(raw_text)
word_to_ix = {word: i for i, word in enumerate(vocab)}
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])


class CBOW(nn.Module):

    def __init__(self):
        pass

    def forward(self, inputs):
        pass

# create your model and train. here are some functions to help you make
# the data ready for use by your module


def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)

make_context_vector(data[0][0], word_to_ix)  # example


######################################################################
# 7. Sequence Models and Long Short-Term Memory Networks
# ======================================================
#


######################################################################
# At this point, we have seen various feed-forward networks. That is,
# there is no state maintained by the network at all. This might not be
# the behavior we want. Sequence models are central to NLP: they are
# models where there is some sort of dependence through time between your
# inputs. The classical example of a sequence model is the Hidden Markov
# Model for part-of-speech tagging. Another example is the conditional
# random field.
#
# A recurrent neural network is a network that maintains some kind of
# state. For example, its output could be used as part of the next input,
# so that information can propagate along as the network passes over the
# sequence. In the case of an LSTM, for each element in the sequence,
# there is a corresponding *hidden state* :math:`h_t`, which in principle
# can contain information from arbitrary points earlier in the sequence.
# We can use the hidden state to predict words in a language model,
# part-of-speech tags, and a myriad of other things.
#


######################################################################
# LSTM's in Pytorch
# ~~~~~~~~~~~~~~~~~
#
# Before getting to the example, note a few things. Pytorch's LSTM expects
# all of its inputs to be 3D tensors. The semantics of the axes of these
# tensors is important. The first axis is the sequence itself, the second
# indexes instances in the mini-batch, and the third indexes elements of
# the input. We haven't discussed mini-batching, so let's just ignore that
# and assume we will always have just 1 dimension on the second axis. If
# we want to run the sequence model over the sentence "The cow jumped",
# our input should look like
#
# .. math::
#
#    \begin{bmatrix}
#    \overbrace{q_\text{The}}^\text{row vector} \\
#    q_\text{cow} \\
#    q_\text{jumped}
#    \end{bmatrix}
#
# Except remember there is an additional 2nd dimension with size 1.
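
######################################################################
# As a tiny sketch (toy numbers of our own): a length-3 sentence with
# 4-dimensional word embeddings enters the LSTM as a 3 x 1 x 4 tensor.
#

seq = autograd.Variable(torch.randn(3, 1, 4))
print(seq.size())  # (seq_len, mini-batch size, embedding dim) = (3, 1, 4)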
######################################################################
# In addition, you could go through the sequence one at a time, in which
# case the 1st axis will have size 1 also.
#
# Let's see a quick example.
#

lstm = nn.LSTM(3, 3)  # Input dim is 3, output dim is 3
inputs = [autograd.Variable(torch.randn((1, 3)))
          for _ in range(5)]  # make a sequence of length 5

# initialize the hidden state.
hidden = (autograd.Variable(torch.randn(1, 1, 3)),
          autograd.Variable(torch.randn((1, 1, 3))))
for i in inputs:
    # Step through the sequence one element at a time.
    # after each step, hidden contains the hidden state.
    out, hidden = lstm(i.view(1, 1, -1), hidden)

# alternatively, we can do the entire sequence all at once.
# the first value returned by LSTM is all of the hidden states throughout
# the sequence. the second is just the most recent hidden state
# (compare the last slice of "out" with "hidden" below, they are the same)
# The reason for this is that:
# "out" will give you access to all hidden states in the sequence
# "hidden" will allow you to continue the sequence and backpropagate,
# by passing it as an argument to the lstm at a later time
# Add the extra 2nd dimension
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
hidden = (autograd.Variable(torch.randn(1, 1, 3)), autograd.Variable(
    torch.randn((1, 1, 3))))  # clean out hidden state
out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)


######################################################################
# Example: An LSTM for Part-of-Speech Tagging
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#


######################################################################
# In this section, we will use an LSTM to get part of speech tags. We will
# not use Viterbi or Forward-Backward or anything like that, but as a
# (challenging) exercise to the reader, think about how Viterbi could be
# used after you have seen what is going on.
#
# The model is as follows: let our input sentence be
# :math:`w_1, \dots, w_M`, where :math:`w_i \in V`, our vocab. Also, let
# :math:`T` be our tag set, and :math:`y_i` the tag of word :math:`w_i`.
# Denote our prediction of the tag of word :math:`w_i` by
# :math:`\hat{y}_i`.
#
# This is a structured prediction model, where our output is a sequence
# :math:`\hat{y}_1, \dots, \hat{y}_M`, where :math:`\hat{y}_i \in T`.
#
# To do the prediction, pass an LSTM over the sentence. Denote the hidden
# state at timestep :math:`i` as :math:`h_i`. Also, assign each tag a
# unique index (like how we had word\_to\_ix in the word embeddings
# section). Then our prediction rule for :math:`\hat{y}_i` is
#
# .. math:: \hat{y}_i = \text{argmax}_j \ (\log \text{Softmax}(Ah_i + b))_j
#
# That is, take the log softmax of the affine map of the hidden state,
# and the predicted tag is the tag that has the maximum value in this
# vector. Note this implies immediately that the dimensionality of the
# target space of :math:`A` is :math:`|T|`.
#

def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)

training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
print(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6


class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we don't have any hidden state.
        # Refer to the Pytorch documentation to see exactly why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        # Pass the stored hidden state in, and keep the updated one
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space)
        return tag_scores

model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
inputs = prepare_sequence(training_data[0][0], word_to_ix)
tag_scores = model(inputs)
print(tag_scores)

for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients. We need to clear them out
        # before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM, detaching it from its
        # history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into Variables
        # of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by calling
        # optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# See what the scores are after training
inputs = prepare_sequence(training_data[0][0], word_to_ix)
tag_scores = model(inputs)
# The sentence is "the dog ate the apple". i,j corresponds to score for tag j for word i.
# The predicted tag is the maximum scoring tag.
# Here, we can see the predicted sequence below is 0 1 2 0 1
# since 0 is the index of the maximum value of row 1,
# 1 is the index of the maximum value of row 2, etc.
# Which is DET NOUN VERB DET NOUN, the correct sequence!
print(tag_scores)


######################################################################
# Exercise: Augmenting the LSTM part-of-speech tagger with character-level features
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# In the example above, each word had an embedding, which served as the
# inputs to our sequence model. Let's augment the word embeddings with a
# representation derived from the characters of the word. We expect that
# this should help significantly, since character-level information like
# affixes has a large bearing on part-of-speech. For example, words with
# the affix *-ly* are almost always tagged as adverbs in English.
#
# To do this, let :math:`c_w` be the character-level representation of
# word :math:`w`. Let :math:`x_w` be the word embedding as before. Then
# the input to our sequence model is the concatenation of :math:`x_w` and
# :math:`c_w`. So if :math:`x_w` has dimension 5, and :math:`c_w`
# dimension 3, then our LSTM should accept an input of dimension 8.
#
# To get the character level representation, do an LSTM over the
# characters of a word, and let :math:`c_w` be the final hidden state of
# this LSTM. Hints:
#
# * There are going to be two LSTM's in your new model.
#   The original one that outputs POS tag scores, and the new one that
#   outputs a character-level representation of each word.
# * To do a sequence model over characters, you will have to embed characters.
#   The character embeddings will be the input to the character LSTM.
#


######################################################################
# 8. Advanced: Making Dynamic Decisions and the Bi-LSTM CRF
# =========================================================
#


######################################################################
# Dynamic versus Static Deep Learning Toolkits
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Pytorch is a *dynamic* neural network kit. Another example of a dynamic
# kit is `Dynet `__ (I mention this because
# working with Pytorch and Dynet is similar. If you see an example in
# Dynet, it will probably help you implement it in Pytorch). The opposite
# is the *static* toolkit, which includes Theano, Keras, TensorFlow, etc.
# The core difference is the following:
#
# * In a static toolkit, you define
#   a computation graph once, compile it, and then stream instances to it.
# * In a dynamic toolkit, you define a computation graph *for each
#   instance*. It is never compiled and is executed on-the-fly.
#
# Without a lot of experience, it is difficult to appreciate the
# difference. One example is to suppose we want to build a deep
# constituent parser. Suppose our model involves roughly the following
# steps:
#
# * We build the tree bottom up
# * Tag the root nodes (the words of the sentence)
# * From there, use a neural network and the embeddings
#   of the words to find combinations that form constituents.
#
# Whenever you form a new constituent, use some sort of technique to get
# an embedding of the constituent. In this case, our network architecture
# will depend completely on the input sentence. In the sentence "The green
# cat scratched the wall", at some point in the model, we will want to
# combine the span :math:`(i,j,r) = (1, 3, \text{NP})` (that is, an NP
# constituent spans word 1 to word 3, in this case "The green cat").
#
# However, another sentence might be "Somewhere, the big fat cat scratched
# the wall". In this sentence, we will want to form the constituent
# :math:`(2, 4, \text{NP})` at some point. The constituents we will want
# to form will depend on the instance. If we just compile the computation
# graph once, as in a static toolkit, it will be exceptionally difficult
# or impossible to program this logic. In a dynamic toolkit though, there
# isn't just 1 pre-defined computation graph. There can be a new
# computation graph for each instance, so this problem goes away.
#
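
######################################################################
# A minimal sketch (our own toy example) of "a new graph per instance":
# the number of operations recorded below depends on the input's length,
# something a static toolkit would need to know when compiling the graph.
#

def score_sequence(seq):
    total = autograd.Variable(torch.zeros(1))
    for v in seq:  # the computation graph grows with len(seq)
        total = total + v.sum()
    return total

short_seq = [autograd.Variable(torch.randn(2)) for _ in range(2)]
long_seq = [autograd.Variable(torch.randn(2)) for _ in range(5)]
print(score_sequence(short_seq))  # a graph with two additions
print(score_sequence(long_seq))   # a graph with five additions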
######################################################################
# Dynamic toolkits also have the advantage of being easier to debug and
# the code more closely resembling the host language (by that I mean that
# Pytorch and Dynet look more like actual Python code than Keras or
# Theano).
#


######################################################################
# Bi-LSTM Conditional Random Field Discussion
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# For this section, we will see a full, complicated example of a Bi-LSTM
# Conditional Random Field for named-entity recognition. The LSTM tagger
# above is typically sufficient for part-of-speech tagging, but a sequence
# model like the CRF is really essential for strong performance on NER.
# Familiarity with CRF's is assumed. Although this name sounds scary, all
# the model is is a CRF where an LSTM provides the features. This is
# an advanced model though, far more complicated than any earlier model in
# this tutorial. If you want to skip it, that is fine. To see if you're
# ready, see if you can:
#
# - Write the recurrence for the viterbi variable at step i for tag k.
# - Modify the above recurrence to compute the forward variables instead.
# - Modify again the above recurrence to compute the forward variables in
#   log-space (hint: log-sum-exp)
#
# If you can do those three things, you should be able to understand the
# code below. Recall that the CRF computes a conditional probability. Let
# :math:`y` be a tag sequence and :math:`x` an input sequence of words.
# Then we compute
#
# .. math:: P(y|x) = \frac{\exp(\text{Score}(x, y))}{\sum_{y'} \exp(\text{Score}(x, y'))}
#
# where the score is determined by defining some log potentials
# :math:`\log \psi_i(x,y)` such that
#
# .. math:: \text{Score}(x,y) = \sum_i \log \psi_i(x,y)
#
# To make the partition function tractable, the potentials must look only
# at local features.
#
# In the Bi-LSTM CRF, we define two kinds of potentials: emission and
# transition. The emission potential for the word at index :math:`i` comes
# from the hidden state of the Bi-LSTM at timestep :math:`i`. The
# transition scores are stored in a :math:`|T| \times |T|` matrix
# :math:`\textbf{P}`, where :math:`T` is the tag set. In my
# implementation, :math:`\textbf{P}_{j,k}` is the score of transitioning
# to tag :math:`j` from tag :math:`k`. So:
#
# .. math:: \text{Score}(x,y) = \sum_i \log \psi_\text{EMIT}(y_i \rightarrow x_i) + \log \psi_\text{TRANS}(y_{i-1} \rightarrow y_i)
#
# .. math:: = \sum_i h_i[y_i] + \textbf{P}_{y_i, y_{i-1}}
#
# where in this second expression, we think of the tags as being assigned
# unique non-negative indices.
#
# If the above discussion was too brief, you can check out
# `this `__ write-up from
# Michael Collins on CRFs.
#
# Implementation Notes
# ~~~~~~~~~~~~~~~~~~~~
#
# The example below implements the forward algorithm in log space to
# compute the partition function, and the Viterbi algorithm to decode.
# Backpropagation will compute the gradients automatically for us. We
# don't have to do anything by hand.
#
# The implementation is not optimized. If you understand what is going on,
# you'll probably quickly see that iterating over the next tag in the
# forward algorithm could probably be done in one big operation. I wanted
# the code to be more readable. If you want to make the relevant change,
# you could probably use this tagger for real tasks.
#

# Helper functions to make the code more readable.
-def to_scalar(var): - # returns a python float - return var.view(-1).data.tolist()[0] - - -def argmax(vec): - # return the argmax as a python int - _, idx = torch.max(vec, 1) - return to_scalar(idx) - -# Compute log sum exp in a numerically stable way for the forward algorithm - - -def log_sum_exp(vec): - max_score = vec[0, argmax(vec)] - max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1]) - return max_score + torch.log(torch.sum(torch.exp(vec - max_score_broadcast))) - - -class BiLSTM_CRF(nn.Module): - - def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim): - super(BiLSTM_CRF, self).__init__() - self.embedding_dim = embedding_dim - self.hidden_dim = hidden_dim - self.vocab_size = vocab_size - self.tag_to_ix = tag_to_ix - self.tagset_size = len(tag_to_ix) - - self.word_embeds = nn.Embedding(vocab_size, embedding_dim) - self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, - num_layers=1, bidirectional=True) - - # Maps the output of the LSTM into tag space. - self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size) - - # Matrix of transition parameters. Entry i,j is the score of - # transitioning *to* i *from* j. - self.transitions = nn.Parameter( - torch.randn(self.tagset_size, self.tagset_size)) - - self.hidden = self.init_hidden() - - def init_hidden(self): - return (autograd.Variable(torch.randn(2, 1, self.hidden_dim)), - autograd.Variable(torch.randn(2, 1, self.hidden_dim))) - - def _forward_alg(self, feats): - # Do the forward algorithm to compute the partition function - init_alphas = torch.Tensor(1, self.tagset_size).fill_(-10000.) - # START_TAG has all of the score. - init_alphas[0][self.tag_to_ix[START_TAG]] = 0. - - # Wrap in a variable so that we will get automatic backprop - forward_var = autograd.Variable(init_alphas) - - # Iterate through the sentence - for feat in feats: - alphas_t = [] # The forward variables at this timestep - for next_tag in range(self.tagset_size): - # broadcast the emission score: it is the same regardless of - # the previous tag - emit_score = feat[next_tag].view( - 1, -1).expand(1, self.tagset_size) - # the ith entry of trans_score is the score of transitioning to - # next_tag from i - trans_score = self.transitions[next_tag].view(1, -1) - # The ith entry of next_tag_var is the value for the edge (i -> next_tag) - # before we do log-sum-exp - next_tag_var = forward_var + trans_score + emit_score - # The forward variable for this tag is log-sum-exp of all the - # scores. 
- alphas_t.append(log_sum_exp(next_tag_var)) - forward_var = torch.cat(alphas_t).view(1, -1) - terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]] - alpha = log_sum_exp(terminal_var) - return alpha - - def _get_lstm_features(self, sentence): - self.hidden = self.init_hidden() - embeds = self.word_embeds(sentence).view(len(sentence), 1, -1) - lstm_out, self.hidden = self.lstm(embeds) - lstm_out = lstm_out.view(len(sentence), self.hidden_dim) - lstm_feats = self.hidden2tag(lstm_out) - return lstm_feats - - def _score_sentence(self, feats, tags): - # Gives the score of a provided tag sequence - score = autograd.Variable(torch.Tensor([0])) - tags = torch.cat([torch.LongTensor([self.tag_to_ix[START_TAG]]), tags]) - for i, feat in enumerate(feats): - score = score + \ - self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]] - score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]] - return score - - def _viterbi_decode(self, feats): - backpointers = [] - - # Initialize the viterbi variables in log space - init_vvars = torch.Tensor(1, self.tagset_size).fill_(-10000.) - init_vvars[0][self.tag_to_ix[START_TAG]] = 0 - - # forward_var at step i holds the viterbi variables for step i-1 - forward_var = autograd.Variable(init_vvars) - for feat in feats: - bptrs_t = [] # holds the backpointers for this step - viterbivars_t = [] # holds the viterbi variables for this step - - for next_tag in range(self.tagset_size): - # next_tag_var[i] holds the viterbi variable for tag i at the previous step, - # plus the score of transitioning from tag i to next_tag. - # We don't include the emission scores here because the max - # does not depend on them (we add them in below) - next_tag_var = forward_var + self.transitions[next_tag] - best_tag_id = argmax(next_tag_var) - bptrs_t.append(best_tag_id) - viterbivars_t.append(next_tag_var[0][best_tag_id]) - # Now add in the emission scores, and assign forward_var to the set - # of viterbi variables we just computed - forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1) - backpointers.append(bptrs_t) - - # Transition to STOP_TAG - terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]] - best_tag_id = argmax(terminal_var) - path_score = terminal_var[0][best_tag_id] - - # Follow the back pointers to decode the best path. - best_path = [best_tag_id] - for bptrs_t in reversed(backpointers): - best_tag_id = bptrs_t[best_tag_id] - best_path.append(best_tag_id) - # Pop off the start tag (we dont want to return that to the caller) - start = best_path.pop() - assert start == self.tag_to_ix[START_TAG] # Sanity check - best_path.reverse() - return path_score, best_path - - def neg_log_likelihood(self, sentence, tags): - self.hidden = self.init_hidden() - feats = self._get_lstm_features(sentence) - forward_score = self._forward_alg(feats) - gold_score = self._score_sentence(feats, tags) - return forward_score - gold_score - - def forward(self, sentence): # dont confuse this with _forward_alg above. - self.hidden = self.init_hidden() - # Get the emission scores from the BiLSTM - lstm_feats = self._get_lstm_features(sentence) - - # Find the best path, given the features. 
- score, tag_seq = self._viterbi_decode(lstm_feats) - return score, tag_seq - - -START_TAG = "" -STOP_TAG = "" -EMBEDDING_DIM = 5 -HIDDEN_DIM = 4 - -# Make up some training data -training_data = [( - "the wall street journal reported today that apple corporation made money".split(), - "B I I I O O O B I O O".split() -), ( - "georgia tech is a university in georgia".split(), - "B I O O O O B".split() -)] - -word_to_ix = {} -for sentence, tags in training_data: - for word in sentence: - if word not in word_to_ix: - word_to_ix[word] = len(word_to_ix) - -tag_to_ix = {"B": 0, "I": 1, "O": 2, START_TAG: 3, STOP_TAG: 4} - -model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM) -optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4) - -# Check predictions before training -precheck_sent = prepare_sequence(training_data[0][0], word_to_ix) -precheck_tags = torch.LongTensor([tag_to_ix[t] for t in training_data[0][1]]) -print(model(precheck_sent)) - -# Make sure prepare_sequence from earlier in the LSTM section is loaded -for epoch in range(300): # again, normally you would NOT do 300 epochs, it is toy data - for sentence, tags in training_data: - # Step 1. Remember that Pytorch accumulates gradients. We need to clear them out - # before each instance - model.zero_grad() - - # Step 2. Get our inputs ready for the network, that is, turn them into Variables - # of word indices. - sentence_in = prepare_sequence(sentence, word_to_ix) - targets = torch.LongTensor([tag_to_ix[t] for t in tags]) - - # Step 3. Run our forward pass. - neg_log_likelihood = model.neg_log_likelihood(sentence_in, targets) - - # Step 4. Compute the loss, gradients, and update the parameters by calling - # optimizer.step() - neg_log_likelihood.backward() - optimizer.step() - -# Check predictions after training -precheck_sent = prepare_sequence(training_data[0][0], word_to_ix) -print(model(precheck_sent)) -# We got it! - - -###################################################################### -# Exercise: A new loss function for discriminative tagging -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# It wasn't really necessary for us to create a computation graph when -# doing decoding, since we do not backpropagate from the viterbi path -# score. Since we have it anyway, try training the tagger where the loss -# function is the difference between the Viterbi path score and the score -# of the gold-standard path. It should be clear that this function is -# non-negative and 0 when the predicted tag sequence is the correct tag -# sequence. This is essentially *structured perceptron*. -# -# This modification should be short, since Viterbi and score\_sentence are -# already implemented. This is an example of the shape of the computation -# graph *depending on the training instance*. Although I haven't tried -# implementing this in a static toolkit, I imagine that it is possible but -# much less straightforward. -# -# Pick up some real data and do a comparison! -# diff --git a/beginner_source/deep_learning_nlp_tutorial.rst b/beginner_source/deep_learning_nlp_tutorial.rst new file mode 100644 index 00000000000..0496007320c --- /dev/null +++ b/beginner_source/deep_learning_nlp_tutorial.rst @@ -0,0 +1,56 @@ +Deep Learning for NLP with Pytorch +********************************** +**Author**: `Robert Guthrie `_ + +This tutorial will walk you through the key ideas of deep learning +programming using Pytorch. 
Many of the concepts (such as the computation
+graph abstraction and autograd) are not unique to Pytorch and are
+relevant to any deep learning tool kit out there.
+
+I am writing this tutorial to focus specifically on NLP for people who
+have never written code in any deep learning framework (e.g., TensorFlow,
+Theano, Keras, Dynet). It assumes working knowledge of core NLP
+problems: part-of-speech tagging, language modeling, etc. It also
+assumes familiarity with neural networks at the level of an intro AI
+class (such as one from the Russell and Norvig book). Usually, these
+courses cover the basic backpropagation algorithm on feed-forward neural
+networks, and make the point that they are chains of compositions of
+linearities and non-linearities. This tutorial aims to get you started
+writing deep learning code, given you have this prerequisite knowledge.
+
+Note this is about *models*, not data. For all of the models, I just
+create a few test examples with small dimensionality so you can see how
+the weights change as it trains. If you have some real data you want to
+try, you should be able to rip out any of the models from this notebook
+and use them on it.
+
+
+.. toctree::
+    :hidden:
+
+    /beginner/nlp/pytorch_tutorial
+    /beginner/nlp/deep_learning_tutorial
+    /beginner/nlp/word_embeddings_tutorial
+    /beginner/nlp/sequence_models_tutorial
+    /beginner/nlp/advanced_tutorial
+
+
+.. galleryitem:: beginner/nlp/pytorch_tutorial.py
+    :intro: All of deep learning is computations on tensors, which are generalizations of a matrix that can be
+
+.. galleryitem:: beginner/nlp/deep_learning_tutorial.py
+    :intro: Deep learning consists of composing linearities with non-linearities in clever ways. The introduction of non-linearities allows
+
+.. galleryitem:: beginner/nlp/word_embeddings_tutorial.py
+    :intro: Word embeddings are dense vectors of real numbers, one per word in your vocabulary. In NLP, it is almost always the case that your features are
+
+.. galleryitem:: beginner/nlp/sequence_models_tutorial.py
+    :intro: At this point, we have seen various feed-forward networks. That is, there is no state maintained by the network at all.
+
+.. galleryitem:: beginner/nlp/advanced_tutorial.py
+    :intro: Dynamic versus Static Deep Learning Toolkits. Pytorch is a *dynamic* neural network kit.
+
+
+.. raw:: html
+
+    <div style='clear:both'></div>
\ No newline at end of file
diff --git a/beginner_source/nlp/README.txt b/beginner_source/nlp/README.txt
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/beginner_source/nlp/advanced_tutorial.py b/beginner_source/nlp/advanced_tutorial.py
new file mode 100644
index 00000000000..cf6efa22422
--- /dev/null
+++ b/beginner_source/nlp/advanced_tutorial.py
@@ -0,0 +1,366 @@
+# -*- coding: utf-8 -*-
+r"""
+Advanced: Making Dynamic Decisions and the Bi-LSTM CRF
+======================================================
+
+Dynamic versus Static Deep Learning Toolkits
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Pytorch is a *dynamic* neural network kit. Another example of a dynamic
+kit is `Dynet `__ (I mention this because
+working with Pytorch and Dynet is similar. If you see an example in
+Dynet, it will probably help you implement it in Pytorch). The opposite
+is the *static* tool kit, which includes Theano, Keras, TensorFlow, etc.
+The core difference is the following:
+
+* In a static toolkit, you define
+  a computation graph once, compile it, and then stream instances to it.
+* In a dynamic toolkit, you define a computation graph *for each
+  instance*. It is never compiled and is executed on-the-fly.
+
+Without a lot of experience, it is difficult to appreciate the
+difference. For one example, suppose we want to build a deep
+constituent parser. Suppose our model involves roughly the following
+steps:
+
+* We build the tree bottom up
+* Tag the leaf nodes (the words of the sentence)
+* From there, use a neural network and the embeddings of the words to
+  find combinations that form constituents.
+
+Whenever you form a new constituent, use some sort of technique to get
+an embedding of the constituent. In this case, our network architecture
+will depend completely on the input sentence. In the sentence "The green
+cat scratched the wall", at some point in the model, we will want to
+combine the span :math:`(i,j,r) = (1, 3, \text{NP})` (that is, an NP
+constituent spans word 1 to word 3, in this case "The green cat").
+
+However, another sentence might be "Somewhere, the big fat cat scratched
+the wall". In this sentence, we will want to form the constituent
+:math:`(2, 4, \text{NP})` at some point. The constituents we will want
+to form will depend on the instance. If we just compile the computation
+graph once, as in a static toolkit, it will be exceptionally difficult
+or impossible to program this logic. In a dynamic toolkit though, there
+isn't just one pre-defined computation graph. There can be a new
+computation graph for each instance, so this problem goes away.
+
+Dynamic toolkits also have the advantage of being easier to debug, and
+of code that more closely resembles the host language (by that I mean
+that Pytorch and Dynet look more like actual Python code than Keras or
+Theano).
+"""
+
+
+#####################################################################
+# Bi-LSTM Conditional Random Field Discussion
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# For this section, we will see a full, complicated example of a Bi-LSTM
+# Conditional Random Field for named-entity recognition. The LSTM tagger
+# above is typically sufficient for part-of-speech tagging, but a sequence
+# model like the CRF is really essential for strong performance on NER.
+# Familiarity with CRFs is assumed. Although this name sounds scary, the
+# model is simply a CRF in which an LSTM provides the features.
+# This is an advanced model though, far more complicated than any earlier
+# model in this tutorial. If you want to skip it, that is fine. To see if
+# you're ready, see if you can:
+#
+# - Write the recurrence for the Viterbi variable at step i for tag k.
+# - Modify the above recurrence to compute the forward variables instead.
+# - Modify again the above recurrence to compute the forward variables in
+#   log-space (hint: log-sum-exp)
+#
+# If you can do those three things, you should be able to understand the
+# code below. Recall that the CRF computes a conditional probability. Let
+# :math:`y` be a tag sequence and :math:`x` an input sequence of words.
+# Then we compute
+#
+# .. math:: P(y|x) = \frac{\exp{(\text{Score}(x, y))}}{\sum_{y'} \exp{(\text{Score}(x, y'))}}
+#
+# where the score is determined by defining some log potentials
+# :math:`\log \psi_i(x,y)` such that
+#
+# .. math:: \text{Score}(x,y) = \sum_i \log \psi_i(x,y)
+#
+# To make the partition function tractable, the potentials must look only
+# at local features.
+#
+# In the Bi-LSTM CRF, we define two kinds of potentials: emission and
+# transition. The emission potential for the word at index :math:`i` comes
+# from the hidden state of the Bi-LSTM at timestep :math:`i`. The
+# transition scores are stored in a :math:`|T| \times |T|` matrix
+# :math:`\textbf{P}`, where :math:`T` is the tag set. In my
+# implementation, :math:`\textbf{P}_{j,k}` is the score of transitioning
+# to tag :math:`j` from tag :math:`k`. So:
+#
+# .. math:: \text{Score}(x,y) = \sum_i \log \psi_\text{EMIT}(y_i \rightarrow x_i) + \log \psi_\text{TRANS}(y_{i-1} \rightarrow y_i)
+#
+# .. math:: = \sum_i h_i[y_i] + \textbf{P}_{y_i, y_{i-1}}
+#
+# where in this second expression, we think of the tags as being assigned
+# unique non-negative indices.
+#
+# If the above discussion was too brief, you can check out
+# `this `__ write-up from
+# Michael Collins on CRFs.
+#
+# Implementation Notes
+# ~~~~~~~~~~~~~~~~~~~~
+#
+# The example below implements the forward algorithm in log space to
+# compute the partition function, and the Viterbi algorithm to decode.
+# Backpropagation will compute the gradients automatically for us. We
+# don't have to do anything by hand.
+#
+# The implementation is not optimized. If you understand what is going on,
+# you'll probably quickly see that iterating over the next tag in the
+# forward algorithm could probably be done in one big operation. I wanted
+# the code to be more readable. If you want to make the relevant change,
+# you could probably use this tagger for real tasks.
+#####################################################################
+# Author: Robert Guthrie
+
+import torch
+import torch.autograd as autograd
+import torch.nn as nn
+import torch.optim as optim
+from sequence_models_tutorial import prepare_sequence
+
+torch.manual_seed(1)
+
+#####################################################################
+
+# Helper functions to make the code more readable.
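+# Before defining them, a quick hand-rolled illustration (with made-up
+# numbers, not part of the model below) of why log_sum_exp subtracts the
+# max before exponentiating: the naive computation overflows for large
+# scores, while the shifted one stays finite.
+
+big = torch.Tensor([[1000., 1001.]])
+print(torch.log(torch.sum(torch.exp(big))))  # naive: inf
+max_s = big.max()
+print(max_s + torch.log(torch.sum(torch.exp(big - max_s))))  # ~1001.31
+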
+def to_scalar(var):
+    # returns a python float
+    return var.view(-1).data.tolist()[0]
+
+
+def argmax(vec):
+    # return the argmax as a python int
+    _, idx = torch.max(vec, 1)
+    return to_scalar(idx)
+
+# Compute log sum exp in a numerically stable way for the forward algorithm
+
+
+def log_sum_exp(vec):
+    max_score = vec[0, argmax(vec)]
+    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
+    return max_score + torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))
+
+
+class BiLSTM_CRF(nn.Module):
+
+    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
+        super(BiLSTM_CRF, self).__init__()
+        self.embedding_dim = embedding_dim
+        self.hidden_dim = hidden_dim
+        self.vocab_size = vocab_size
+        self.tag_to_ix = tag_to_ix
+        self.tagset_size = len(tag_to_ix)
+
+        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
+        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
+                            num_layers=1, bidirectional=True)
+
+        # Maps the output of the LSTM into tag space.
+        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
+
+        # Matrix of transition parameters. Entry i,j is the score of
+        # transitioning *to* i *from* j.
+        self.transitions = nn.Parameter(
+            torch.randn(self.tagset_size, self.tagset_size))
+
+        self.hidden = self.init_hidden()
+
+    def init_hidden(self):
+        # The LSTM is bidirectional with hidden size hidden_dim // 2, so
+        # h_0 and c_0 each have shape (2, 1, hidden_dim // 2).
+        return (autograd.Variable(torch.randn(2, 1, self.hidden_dim // 2)),
+                autograd.Variable(torch.randn(2, 1, self.hidden_dim // 2)))
+
+    def _forward_alg(self, feats):
+        # Do the forward algorithm to compute the partition function
+        init_alphas = torch.Tensor(1, self.tagset_size).fill_(-10000.)
+        # START_TAG has all of the score.
+        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.
+
+        # Wrap in a variable so that we will get automatic backprop
+        forward_var = autograd.Variable(init_alphas)
+
+        # Iterate through the sentence
+        for feat in feats:
+            alphas_t = []  # The forward variables at this timestep
+            for next_tag in range(self.tagset_size):
+                # broadcast the emission score: it is the same regardless of
+                # the previous tag
+                emit_score = feat[next_tag].view(
+                    1, -1).expand(1, self.tagset_size)
+                # the ith entry of trans_score is the score of transitioning to
+                # next_tag from i
+                trans_score = self.transitions[next_tag].view(1, -1)
+                # The ith entry of next_tag_var is the value for the
+                # edge (i -> next_tag) before we do log-sum-exp
+                next_tag_var = forward_var + trans_score + emit_score
+                # The forward variable for this tag is log-sum-exp of all the
+                # scores.
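+                # (next_tag_var has shape (1, tagset_size); log_sum_exp
+                # collapses it to a single score for next_tag.)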
+                alphas_t.append(log_sum_exp(next_tag_var))
+            forward_var = torch.cat(alphas_t).view(1, -1)
+        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
+        alpha = log_sum_exp(terminal_var)
+        return alpha
+
+    def _get_lstm_features(self, sentence):
+        self.hidden = self.init_hidden()
+        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
+        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
+        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
+        lstm_feats = self.hidden2tag(lstm_out)
+        return lstm_feats
+
+    def _score_sentence(self, feats, tags):
+        # Gives the score of a provided tag sequence
+        score = autograd.Variable(torch.Tensor([0]))
+        tags = torch.cat([torch.LongTensor([self.tag_to_ix[START_TAG]]), tags])
+        for i, feat in enumerate(feats):
+            score = score + \
+                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
+        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
+        return score
+
+    def _viterbi_decode(self, feats):
+        backpointers = []
+
+        # Initialize the viterbi variables in log space
+        init_vvars = torch.Tensor(1, self.tagset_size).fill_(-10000.)
+        init_vvars[0][self.tag_to_ix[START_TAG]] = 0
+
+        # forward_var at step i holds the viterbi variables for step i-1
+        forward_var = autograd.Variable(init_vvars)
+        for feat in feats:
+            bptrs_t = []  # holds the backpointers for this step
+            viterbivars_t = []  # holds the viterbi variables for this step
+
+            for next_tag in range(self.tagset_size):
+                # next_tag_var[i] holds the viterbi variable for tag i at the
+                # previous step, plus the score of transitioning
+                # from tag i to next_tag.
+                # We don't include the emission scores here because the max
+                # does not depend on them (we add them in below)
+                next_tag_var = forward_var + self.transitions[next_tag]
+                best_tag_id = argmax(next_tag_var)
+                bptrs_t.append(best_tag_id)
+                viterbivars_t.append(next_tag_var[0][best_tag_id])
+            # Now add in the emission scores, and assign forward_var to the set
+            # of viterbi variables we just computed
+            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
+            backpointers.append(bptrs_t)
+
+        # Transition to STOP_TAG
+        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
+        best_tag_id = argmax(terminal_var)
+        path_score = terminal_var[0][best_tag_id]
+
+        # Follow the back pointers to decode the best path.
+        best_path = [best_tag_id]
+        for bptrs_t in reversed(backpointers):
+            best_tag_id = bptrs_t[best_tag_id]
+            best_path.append(best_tag_id)
+        # Pop off the start tag (we don't want to return that to the caller)
+        start = best_path.pop()
+        assert start == self.tag_to_ix[START_TAG]  # Sanity check
+        best_path.reverse()
+        return path_score, best_path
+
+    def neg_log_likelihood(self, sentence, tags):
+        self.hidden = self.init_hidden()
+        feats = self._get_lstm_features(sentence)
+        forward_score = self._forward_alg(feats)
+        gold_score = self._score_sentence(feats, tags)
+        return forward_score - gold_score
+
+    def forward(self, sentence):  # don't confuse this with _forward_alg above.
+        self.hidden = self.init_hidden()
+        # Get the emission scores from the BiLSTM
+        lstm_feats = self._get_lstm_features(sentence)
+
+        # Find the best path, given the features.
+        score, tag_seq = self._viterbi_decode(lstm_feats)
+        return score, tag_seq
+
+
+START_TAG = "<START>"
+STOP_TAG = "<STOP>"
+EMBEDDING_DIM = 5
+HIDDEN_DIM = 4
+
+# Make up some training data
+training_data = [(
+    "the wall street journal reported today that apple corporation made money".split(),
+    "B I I I O O O B I O O".split()
+), (
+    "georgia tech is a university in georgia".split(),
+    "B I O O O O B".split()
+)]
+
+word_to_ix = {}
+for sentence, tags in training_data:
+    for word in sentence:
+        if word not in word_to_ix:
+            word_to_ix[word] = len(word_to_ix)
+
+tag_to_ix = {"B": 0, "I": 1, "O": 2, START_TAG: 3, STOP_TAG: 4}
+
+model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
+optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
+
+# Check predictions before training
+precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
+precheck_tags = torch.LongTensor([tag_to_ix[t] for t in training_data[0][1]])
+print(model(precheck_sent))
+
+# prepare_sequence is imported above from the sequence models tutorial
+for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
+    for sentence, tags in training_data:
+        # Step 1. Remember that Pytorch accumulates gradients.
+        # We need to clear them out before each instance
+        model.zero_grad()
+
+        # Step 2. Get our inputs ready for the network, that is,
+        # turn them into Variables of word indices.
+        sentence_in = prepare_sequence(sentence, word_to_ix)
+        targets = torch.LongTensor([tag_to_ix[t] for t in tags])
+
+        # Step 3. Run our forward pass.
+        neg_log_likelihood = model.neg_log_likelihood(sentence_in, targets)
+
+        # Step 4. Compute the loss, gradients, and update the parameters by
+        # calling optimizer.step()
+        neg_log_likelihood.backward()
+        optimizer.step()
+
+# Check predictions after training
+precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
+print(model(precheck_sent))
+# We got it!
+
+
+######################################################################
+# Exercise: A new loss function for discriminative tagging
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# It wasn't really necessary for us to create a computation graph when
+# doing decoding, since we do not backpropagate from the Viterbi path
+# score. Since we have it anyway, try training the tagger where the loss
+# function is the difference between the Viterbi path score and the score
+# of the gold-standard path. It should be clear that this function is
+# non-negative and 0 when the predicted tag sequence is the correct tag
+# sequence. This is essentially *structured perceptron*.
+#
+# This modification should be short, since Viterbi and score\_sentence are
+# already implemented. This is an example of the shape of the computation
+# graph *depending on the training instance*. Although I haven't tried
+# implementing this in a static toolkit, I imagine that it is possible but
+# much less straightforward.
+#
+# Pick up some real data and do a comparison!
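+#
+# A minimal sketch of that loss (hedged: one possible implementation, not
+# part of the original tutorial; it reuses the model's private helpers,
+# and the function name is ours):
+
+def structured_perceptron_loss(model, sentence, tags):
+    # Viterbi path score minus gold path score: zero exactly when the
+    # best-scoring path is the gold path, positive otherwise.
+    feats = model._get_lstm_features(sentence)
+    viterbi_score, _ = model._viterbi_decode(feats)
+    gold_score = model._score_sentence(feats, tags)
+    return viterbi_score - gold_score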
+#
diff --git a/beginner_source/nlp/deep_learning_tutorial.py b/beginner_source/nlp/deep_learning_tutorial.py
new file mode 100644
index 00000000000..f83bf23400d
--- /dev/null
+++ b/beginner_source/nlp/deep_learning_tutorial.py
@@ -0,0 +1,390 @@
+# -*- coding: utf-8 -*-
+r"""
+Deep Learning with PyTorch
+**************************
+
+Deep Learning Building Blocks: Affine maps, non-linearities and objectives
+==========================================================================
+
+Deep learning consists of composing linearities with non-linearities in
+clever ways. The introduction of non-linearities allows for powerful
+models. In this section, we will play with these core components, make
+up an objective function, and see how the model is trained.
+
+
+Affine Maps
+~~~~~~~~~~~
+
+One of the core workhorses of deep learning is the affine map, which is
+a function :math:`f(x)` where
+
+.. math:: f(x) = Ax + b
+
+for a matrix :math:`A` and vectors :math:`x, b`. The parameters to be
+learned here are :math:`A` and :math:`b`. Often, :math:`b` is referred to
+as the *bias* term.
+
+
+Pytorch and most other deep learning frameworks do things a little
+differently than traditional linear algebra. They map the rows of the
+input instead of the columns. That is, the :math:`i`'th row of the
+output below is the mapping of the :math:`i`'th row of the input under
+:math:`A`, plus the bias term. Look at the example below.
+
+"""
+
+# Author: Robert Guthrie
+
+import torch
+import torch.autograd as autograd
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+
+torch.manual_seed(1)
+
+
+######################################################################
+
+lin = nn.Linear(5, 3)  # maps from R^5 to R^3, parameters A, b
+# data is 2x5.  A maps from 5 to 3... can we map "data" under A?
+data = autograd.Variable(torch.randn(2, 5))
+print(lin(data))  # yes
+
+
+######################################################################
+# Non-Linearities
+# ~~~~~~~~~~~~~~~
+#
+# First, note the following fact, which will explain why we need
+# non-linearities in the first place. Suppose we have two affine maps
+# :math:`f(x) = Ax + b` and :math:`g(x) = Cx + d`. What is
+# :math:`f(g(x))`?
+#
+# .. math:: f(g(x)) = A(Cx + d) + b = ACx + (Ad + b)
+#
+# :math:`AC` is a matrix and :math:`Ad + b` is a vector, so we see that
+# composing affine maps gives you an affine map.
+#
+# From this, you can see that if you wanted your neural network to be a
+# long chain of affine compositions, this would add no more power to your
+# model than a single affine map.
+#
+# If we introduce non-linearities in between the affine layers, this is no
+# longer the case, and we can build much more powerful models.
+#
+# There are a few core non-linearities.
+# :math:`\tanh(x), \sigma(x), \text{ReLU}(x)` are the most common. You are
+# probably wondering: "why these functions? I can think of plenty of other
+# non-linearities." The reason for this is that they have gradients that
+# are easy to compute, and computing gradients is essential for learning.
+# For example
+#
+# .. math:: \frac{d\sigma}{dx} = \sigma(x)(1 - \sigma(x))
+#
+# A quick note: although you may have learned some neural networks in your
+# intro to AI class where :math:`\sigma(x)` was the default non-linearity,
+# typically people shy away from it in practice. This is because the
+# gradient *vanishes* very quickly as the absolute value of the argument
+# grows. Small gradients mean it is hard to learn.
+# Most people default to tanh or ReLU.
+#
+
+# In pytorch, most non-linearities are in torch.nn.functional (we have it imported as F)
+# Note that non-linearities typically don't have parameters like affine maps do.
+# That is, they don't have weights that are updated during training.
+data = autograd.Variable(torch.randn(2, 2))
+print(data)
+print(F.relu(data))
+
+
+######################################################################
+# Softmax and Probabilities
+# ~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# The function :math:`\text{Softmax}(x)` is also just a non-linearity, but
+# it is special in that it usually is the last operation done in a
+# network. This is because it takes in a vector of real numbers and
+# returns a probability distribution. Its definition is as follows. Let
+# :math:`x` be a vector of real numbers (positive, negative, whatever,
+# there are no constraints). Then the i'th component of
+# :math:`\text{Softmax}(x)` is
+#
+# .. math:: \frac{\exp(x_i)}{\sum_j \exp(x_j)}
+#
+# It should be clear that the output is a probability distribution: each
+# element is non-negative and the sum over all components is 1.
+#
+# You could also think of it as just applying an element-wise
+# exponentiation operator to the input to make everything non-negative and
+# then dividing by the normalization constant.
+#
+
+# Softmax is also in torch.nn.functional
+data = autograd.Variable(torch.randn(5))
+print(data)
+print(F.softmax(data))
+print(F.softmax(data).sum())  # Sums to 1 because it is a distribution!
+print(F.log_softmax(data))  # there's also log_softmax
+
+
+######################################################################
+# Objective Functions
+# ~~~~~~~~~~~~~~~~~~~
+#
+# The objective function is the function that your network is being
+# trained to minimize (in which case it is often called a *loss function*
+# or *cost function*). This proceeds by first choosing a training
+# instance, running it through your neural network, and then computing the
+# loss of the output. The parameters of the model are then updated by
+# taking the derivative of the loss function. Intuitively, if your model
+# is completely confident in its answer, and its answer is wrong, your
+# loss will be high. If it is very confident in its answer, and its answer
+# is correct, the loss will be low.
+#
+# The idea behind minimizing the loss function on your training examples
+# is that your network will hopefully generalize well and have small loss
+# on unseen examples in your dev set, test set, or in production. An
+# example loss function is the *negative log likelihood loss*, which is a
+# very common objective for multi-class classification. For supervised
+# multi-class classification, this means training the network to minimize
+# the negative log probability of the correct output (or equivalently,
+# maximize the log probability of the correct output).
+#
+
+
+######################################################################
+# Optimization and Training
+# =========================
+#
+# So suppose we can compute a loss function for an instance. What do we do
+# with that? We saw earlier that autograd.Variables know how to compute
+# gradients with respect to the things that were used to compute them. Well,
+# since our loss is an autograd.Variable, we can compute gradients with
+# respect to all of the parameters used to compute it! Then we can perform
+# standard gradient updates.
+# Let :math:`\theta` be our parameters, :math:`L(\theta)` the loss
+# function, and :math:`\eta` a positive learning rate. Then:
+#
+# .. math:: \theta^{(t+1)} = \theta^{(t)} - \eta \nabla_\theta L(\theta)
+#
+# There are a huge collection of algorithms and active research in
+# attempting to do something more than just this vanilla gradient update.
+# Many attempt to vary the learning rate based on what is happening at
+# train time. You don't need to worry about what specifically these
+# algorithms are doing unless you are really interested. Torch provides
+# many in the torch.optim package, and they are all completely
+# transparent: using the simplest gradient update looks the same in code
+# as using the more complicated algorithms. Trying different update
+# algorithms and different parameters for the update algorithms (like
+# different initial learning rates) is important in optimizing your
+# network's performance. Often, just replacing vanilla SGD with an
+# optimizer like Adam or RMSProp will boost performance noticeably.
+#
+
+
+######################################################################
+# Creating Network Components in Pytorch
+# ======================================
+#
+# Before we move on to our focus on NLP, let's do an annotated example of
+# building a network in Pytorch using only affine maps and
+# non-linearities. We will also see how to compute a loss function, using
+# Pytorch's built-in negative log likelihood, and update parameters by
+# backpropagation.
+#
+# All network components should inherit from nn.Module and override the
+# forward() method. That is about it, as far as the boilerplate is
+# concerned. Inheriting from nn.Module provides functionality to your
+# component. For example, it keeps track of its trainable
+# parameters, and you can swap it between CPU and GPU with the .cuda() or
+# .cpu() functions, etc.
+#
+# Let's write an annotated example of a network that takes in a sparse
+# bag-of-words representation and outputs a probability distribution over
+# two labels: "English" and "Spanish". This model is just logistic
+# regression.
+#
+
+
+######################################################################
+# Example: Logistic Regression Bag-of-Words classifier
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# Our model will map a sparse BOW representation to log probabilities over
+# labels. We assign each word in the vocab an index. For example, say our
+# entire vocab is two words "hello" and "world", with indices 0 and 1
+# respectively. The BoW vector for the sentence "hello hello hello hello"
+# is
+#
+# .. math:: \left[ 4, 0 \right]
+#
+# For "hello world world hello", it is
+#
+# .. math:: \left[ 2, 2 \right]
+#
+# etc. In general, it is
+#
+# .. math:: \left[ \text{Count}(\text{hello}), \text{Count}(\text{world}) \right]
+#
+# Denote this BOW vector as :math:`x`. The output of our network is:
+#
+# .. math:: \log \text{Softmax}(Ax + b)
+#
+# That is, we pass the input through an affine map and then do log
+# softmax.
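+#
+# (A quick hedged check of the BoW arithmetic above, using the made-up
+# two-word vocab; ``toy_ix`` and ``toy_vec`` are ours, not part of the
+# classifier below.)
+
+toy_ix = {"hello": 0, "world": 1}
+toy_vec = torch.zeros(2)
+for w in "hello world world hello".split():
+    toy_vec[toy_ix[w]] += 1
+print(toy_vec)  # [2, 2], i.e. [Count(hello), Count(world)]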
+#
+
+data = [("me gusta comer en la cafeteria".split(), "SPANISH"),
+        ("Give it to me".split(), "ENGLISH"),
+        ("No creo que sea una buena idea".split(), "SPANISH"),
+        ("No it is not a good idea to get lost at sea".split(), "ENGLISH")]
+
+test_data = [("Yo creo que si".split(), "SPANISH"),
+             ("it is lost on me".split(), "ENGLISH")]
+
+# word_to_ix maps each word in the vocab to a unique integer, which will be its
+# index into the Bag of words vector
+word_to_ix = {}
+for sent, _ in data + test_data:
+    for word in sent:
+        if word not in word_to_ix:
+            word_to_ix[word] = len(word_to_ix)
+print(word_to_ix)
+
+VOCAB_SIZE = len(word_to_ix)
+NUM_LABELS = 2
+
+
+class BoWClassifier(nn.Module):  # inheriting from nn.Module!
+
+    def __init__(self, num_labels, vocab_size):
+        # calls the init function of nn.Module. Don't get confused by syntax,
+        # just always do it in an nn.Module
+        super(BoWClassifier, self).__init__()
+
+        # Define the parameters that you will need. In this case, we need A and b,
+        # the parameters of the affine mapping.
+        # Torch defines nn.Linear(), which provides the affine map.
+        # Make sure you understand why the input dimension is vocab_size
+        # and the output is num_labels!
+        self.linear = nn.Linear(vocab_size, num_labels)
+
+        # NOTE! The non-linearity log softmax does not have parameters! So we don't need
+        # to worry about that here
+
+    def forward(self, bow_vec):
+        # Pass the input through the linear layer,
+        # then pass that through log_softmax.
+        # Many non-linearities and other functions are in torch.nn.functional
+        return F.log_softmax(self.linear(bow_vec))
+
+
+def make_bow_vector(sentence, word_to_ix):
+    vec = torch.zeros(len(word_to_ix))
+    for word in sentence:
+        vec[word_to_ix[word]] += 1
+    return vec.view(1, -1)
+
+
+def make_target(label, label_to_ix):
+    return torch.LongTensor([label_to_ix[label]])
+
+model = BoWClassifier(NUM_LABELS, VOCAB_SIZE)
+
+# The model knows its parameters. The first output below is A, the second is b.
+# Whenever you assign a component to a class variable in the __init__ function
+# of a module, which was done with the line
+# self.linear = nn.Linear(...)
+# Then through some Python magic from the Pytorch devs, your module
+# (in this case, BoWClassifier) will store knowledge of the nn.Linear's parameters
+for param in model.parameters():
+    print(param)
+
+# To run the model, pass in a BoW vector, but wrapped in an autograd.Variable
+sample = data[0]
+bow_vector = make_bow_vector(sample[0], word_to_ix)
+log_probs = model(autograd.Variable(bow_vector))
+print(log_probs)
+
+
+######################################################################
+# Which of the above values corresponds to the log probability of ENGLISH,
+# and which to SPANISH? We never defined it, but we need to if we want to
+# train the thing.
+#
+
+label_to_ix = {"SPANISH": 0, "ENGLISH": 1}
+
+
+######################################################################
+# So let's train! To do this, we pass instances through to get log
+# probabilities, compute a loss function, compute the gradient of the loss
+# function, and then update the parameters with a gradient step. Loss
+# functions are provided by Torch in the nn package. nn.NLLLoss() is the
+# negative log likelihood loss we want. Torch also defines optimization
+# functions in torch.optim. Here, we will just use SGD.
+#
+# Note that the *input* to NLLLoss is a vector of log probabilities, and a
+# target label. It doesn't compute the log probabilities for us.
+# This is why the last layer of our network is log softmax. The loss
+# function nn.CrossEntropyLoss() is the same as NLLLoss(), except it does
+# the log softmax for you.
+#
+
+# Run on test data before we train, just to see a before-and-after
+for instance, label in test_data:
+    bow_vec = autograd.Variable(make_bow_vector(instance, word_to_ix))
+    log_probs = model(bow_vec)
+    print(log_probs)
+
+# Print the matrix column corresponding to "creo"
+print(next(model.parameters())[:, word_to_ix["creo"]])
+
+loss_function = nn.NLLLoss()
+optimizer = optim.SGD(model.parameters(), lr=0.1)
+
+# Usually you want to pass over the training data several times.
+# 100 is much bigger than on a real data set, but real datasets have more than
+# two instances. Usually, somewhere between 5 and 30 epochs is reasonable.
+for epoch in range(100):
+    for instance, label in data:
+        # Step 1. Remember that Pytorch accumulates gradients.
+        # We need to clear them out before each instance
+        model.zero_grad()
+
+        # Step 2. Make our BOW vector and also we must wrap the target in a
+        # Variable as an integer. For example, if the target is SPANISH, then
+        # we wrap the integer 0. The loss function then knows that the 0th
+        # element of the log probabilities is the log probability
+        # corresponding to SPANISH
+        bow_vec = autograd.Variable(make_bow_vector(instance, word_to_ix))
+        target = autograd.Variable(make_target(label, label_to_ix))
+
+        # Step 3. Run our forward pass.
+        log_probs = model(bow_vec)
+
+        # Step 4. Compute the loss, gradients, and update the parameters by
+        # calling optimizer.step()
+        loss = loss_function(log_probs, target)
+        loss.backward()
+        optimizer.step()
+
+for instance, label in test_data:
+    bow_vec = autograd.Variable(make_bow_vector(instance, word_to_ix))
+    log_probs = model(bow_vec)
+    print(log_probs)
+
+# Index corresponding to Spanish goes up, English goes down!
+print(next(model.parameters())[:, word_to_ix["creo"]])
+
+
+######################################################################
+# We got the right answer! You can see that the log probability for
+# Spanish is much higher in the first example, and the log probability for
+# English is much higher in the second for the test data, as it should be.
+#
+# Now you see how to make a Pytorch component, pass some data through it
+# and do gradient updates. We are ready to dig deeper into what deep NLP
+# has to offer.
+#
diff --git a/beginner_source/nlp/pytorch_tutorial.py b/beginner_source/nlp/pytorch_tutorial.py
new file mode 100644
index 00000000000..b07490a9c8b
--- /dev/null
+++ b/beginner_source/nlp/pytorch_tutorial.py
@@ -0,0 +1,281 @@
+# -*- coding: utf-8 -*-
+r"""
+Introduction to PyTorch
+***********************
+
+Introduction to Torch's tensor library
+======================================
+
+All of deep learning is computations on tensors, which are
+generalizations of a matrix that can be indexed in more than 2
+dimensions. We will see exactly what this means in-depth later. First,
+let's look at what we can do with tensors.
+"""
+# Author: Robert Guthrie
+
+import torch
+import torch.autograd as autograd
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+
+torch.manual_seed(1)
+
+
+######################################################################
+# Creating Tensors
+# ~~~~~~~~~~~~~~~~
+#
+# Tensors can be created from Python lists with the torch.Tensor()
+# function.
+#
+
+# Create a torch.Tensor object with the given data. It is a 1D vector
+V_data = [1., 2., 3.]
+V = torch.Tensor(V_data) +print(V) + +# Creates a matrix +M_data = [[1., 2., 3.], [4., 5., 6]] +M = torch.Tensor(M_data) +print(M) + +# Create a 3D tensor of size 2x2x2. +T_data = [[[1., 2.], [3., 4.]], + [[5., 6.], [7., 8.]]] +T = torch.Tensor(T_data) +print(T) + + +###################################################################### +# What is a 3D tensor anyway? Think about it like this. If you have a +# vector, indexing into the vector gives you a scalar. If you have a +# matrix, indexing into the matrix gives you a vector. If you have a 3D +# tensor, then indexing into the tensor gives you a matrix! +# +# A note on terminology: +# when I say "tensor" in this tutorial, it refers +# to any torch.Tensor object. Matrices and vectors are special cases of +# torch.Tensors, where their dimension is 1 and 2 respectively. When I am +# talking about 3D tensors, I will explicitly use the term "3D tensor". +# + +# Index into V and get a scalar +print(V[0]) + +# Index into M and get a vector +print(M[0]) + +# Index into T and get a matrix +print(T[0]) + + +###################################################################### +# You can also create tensors of other datatypes. The default, as you can +# see, is Float. To create a tensor of integer types, try +# torch.LongTensor(). Check the documentation for more data types, but +# Float and Long will be the most common. +# + + +###################################################################### +# You can create a tensor with random data and the supplied dimensionality +# with torch.randn() +# + +x = torch.randn((3, 4, 5)) +print(x) + + +###################################################################### +# Operations with Tensors +# ~~~~~~~~~~~~~~~~~~~~~~~ +# +# You can operate on tensors in the ways you would expect. + +x = torch.Tensor([1., 2., 3.]) +y = torch.Tensor([4., 5., 6.]) +z = x + y +print(z) + + +###################################################################### +# See `the documentation `__ for a +# complete list of the massive number of operations available to you. They +# expand beyond just mathematical operations. +# +# One helpful operation that we will make use of later is concatenation. +# + +# By default, it concatenates along the first axis (concatenates rows) +x_1 = torch.randn(2, 5) +y_1 = torch.randn(3, 5) +z_1 = torch.cat([x_1, y_1]) +print(z_1) + +# Concatenate columns: +x_2 = torch.randn(2, 3) +y_2 = torch.randn(2, 5) +# second arg specifies which axis to concat along +z_2 = torch.cat([x_2, y_2], 1) +print(z_2) + +# If your tensors are not compatible, torch will complain. Uncomment to see the error +# torch.cat([x_1, x_2]) + + +###################################################################### +# Reshaping Tensors +# ~~~~~~~~~~~~~~~~~ +# +# Use the .view() method to reshape a tensor. This method receives heavy +# use, because many neural network components expect their inputs to have +# a certain shape. Often you will need to reshape before passing your data +# to the component. +# + +x = torch.randn(2, 3, 4) +print(x) +print(x.view(2, 12)) # Reshape to 2 rows, 12 columns +# Same as above. 
If one of the dimensions is -1, its size can be inferred
+print(x.view(2, -1))
+
+
+######################################################################
+# Computation Graphs and Automatic Differentiation
+# ================================================
+#
+# The concept of a computation graph is essential to efficient deep
+# learning programming, because it allows you to not have to write the
+# back propagation gradients yourself. A computation graph is simply a
+# specification of how your data is combined to give you the output. Since
+# the graph totally specifies what parameters were involved with which
+# operations, it contains enough information to compute derivatives. This
+# probably sounds vague, so let's see what is going on using the
+# fundamental class of Pytorch: autograd.Variable.
+#
+# First, think from a programmer's perspective. What is stored in the
+# torch.Tensor objects we were creating above? Obviously the data and the
+# shape, and maybe a few other things. But when we added two tensors
+# together, we got an output tensor. All this output tensor knows is its
+# data and shape. It has no idea that it was the sum of two other tensors
+# (it could have been read in from a file, it could be the result of some
+# other operation, etc.)
+#
+# The Variable class keeps track of how it was created. Let's see it in
+# action.
+#
+
+# Variables wrap tensor objects
+x = autograd.Variable(torch.Tensor([1., 2., 3]), requires_grad=True)
+# You can access the data with the .data attribute
+print(x.data)
+
+# You can also do all the same operations you did with tensors with Variables.
+y = autograd.Variable(torch.Tensor([4., 5., 6]), requires_grad=True)
+z = x + y
+print(z.data)
+
+# BUT z knows something extra.
+print(z.creator)
+
+
+######################################################################
+# So Variables know what created them. z knows that it wasn't read in from
+# a file, it wasn't the result of a multiplication or exponential or
+# whatever. And if you keep following z.creator, you will find yourself at
+# x and y.
+#
+# But how does that help us compute a gradient?
+#
+
+# Let's sum up all the entries in z
+s = z.sum()
+print(s)
+print(s.creator)
+
+
+######################################################################
+# So now, what is the derivative of this sum with respect to the first
+# component of x? In math, we want
+#
+# .. math::
+#
+#    \frac{\partial s}{\partial x_0}
+#
+#
+#
+# Well, s knows that it was created as a sum of the entries of the tensor
+# z, and z knows that it was the sum x + y. So
+#
+# .. math::  s = \overbrace{x_0 + y_0}^\text{$z_0$} + \overbrace{x_1 + y_1}^\text{$z_1$} + \overbrace{x_2 + y_2}^\text{$z_2$}
+#
+# And so s contains enough information to determine that the derivative
+# we want is 1!
+#
+# Of course this glosses over the challenge of how to actually compute
+# that derivative. The point here is that s is carrying along enough
+# information that it is possible to compute it. In reality, the
+# developers of Pytorch program the sum() and + operations to know how to
+# compute their gradients, and run the back propagation algorithm. An
+# in-depth discussion of that algorithm is beyond the scope of this
+# tutorial.
+#
+
+
+######################################################################
+# Let's have Pytorch compute the gradient, and see that we were right:
+# (note if you run this block multiple times, the gradient will increment.
+# That is because Pytorch *accumulates* the gradient into the .grad
+# property, since for many models this is very convenient.)
+#
+
+# calling .backward() on any variable will run backprop, starting from it.
+s.backward()
+print(x.grad)
+
+
+######################################################################
+# Understanding what is going on in the block below is crucial for being a
+# successful programmer in deep learning.
+#
+
+x = torch.randn((2, 2))
+y = torch.randn((2, 2))
+z = x + y  # These are Tensor types, and backprop would not be possible
+
+var_x = autograd.Variable(x)
+var_y = autograd.Variable(y)
+# var_z contains enough information to compute gradients, as we saw above
+var_z = var_x + var_y
+print(var_z.creator)
+
+var_z_data = var_z.data  # Get the wrapped Tensor object out of var_z...
+# Re-wrap the tensor in a new variable
+new_var_z = autograd.Variable(var_z_data)
+
+# ... does new_var_z have information to backprop to x and y?
+# NO!
+print(new_var_z.creator)
+# And how could it? We yanked the tensor out of var_z (that is
+# what var_z.data is). This tensor doesn't know anything about
+# how it was computed. We pass it into new_var_z, and this is all the
+# information new_var_z gets. If var_z_data doesn't know how it was
+# computed, there's no way new_var_z will.
+# In essence, we have broken the variable away from its past history
+
+
+######################################################################
+# Here is the basic, extremely important rule for computing with
+# autograd.Variables (note this is more general than Pytorch. There is an
+# equivalent object in every major deep learning toolkit):
+#
+# **If you want the error from your loss function to backpropagate to a
+# component of your network, you MUST NOT break the Variable chain from
+# that component to your loss Variable. If you do, the loss will have no
+# idea your component exists, and its parameters can't be updated.**
+#
+# I say this in bold, because this error can creep up on you in very
+# subtle ways (I will show some such ways below), and it will not cause
+# your code to crash or complain, so you must be careful.
+#
diff --git a/beginner_source/nlp/sequence_models_tutorial.py b/beginner_source/nlp/sequence_models_tutorial.py
new file mode 100644
index 00000000000..dd35ebda1ef
--- /dev/null
+++ b/beginner_source/nlp/sequence_models_tutorial.py
@@ -0,0 +1,259 @@
+# -*- coding: utf-8 -*-
+r"""
+Sequence Models and Long Short-Term Memory Networks
+===================================================
+
+At this point, we have seen various feed-forward networks. That is,
+there is no state maintained by the network at all. This might not be
+the behavior we want. Sequence models are central to NLP: they are
+models where there is some sort of dependence through time between your
+inputs. The classical example of a sequence model is the Hidden Markov
+Model for part-of-speech tagging. Another example is the conditional
+random field.
+
+A recurrent neural network is a network that maintains some kind of
+state. For example, its output could be used as part of the next input,
+so that information can propagate along as the network passes over the
+sequence. In the case of an LSTM, for each element in the sequence,
+there is a corresponding *hidden state* :math:`h_t`, which in principle
+can contain information from arbitrary points earlier in the sequence.
+We can use the hidden state to predict words in a language model,
+part-of-speech tags, and a myriad of other things.
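+
+(As a concrete sketch of what "state" means, and not Pytorch's exact
+parameterization: a plain Elman-style recurrence keeps a hidden vector
+and updates it at every step as
+
+.. math::  h_t = \tanh(W_{ih} x_t + W_{hh} h_{t-1} + b)
+
+so :math:`h_t` depends on every earlier input. An LSTM refines this idea
+with gates that control what the state keeps and forgets.)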
+
+
+LSTMs in Pytorch
+~~~~~~~~~~~~~~~~~
+
+Before getting to the example, note a few things. Pytorch's LSTM expects
+all of its inputs to be 3D tensors. The semantics of the axes of these
+tensors are important. The first axis is the sequence itself, the second
+indexes instances in the mini-batch, and the third indexes elements of
+the input. We haven't discussed mini-batching, so let's just ignore that
+and assume we will always have just 1 dimension on the second axis. If
+we want to run the sequence model over the sentence "The cow jumped",
+our input should look like
+
+.. math::
+
+
+   \begin{bmatrix}
+   \overbrace{q_\text{The}}^\text{row vector} \\
+   q_\text{cow} \\
+   q_\text{jumped}
+   \end{bmatrix}
+
+Except remember there is an additional 2nd dimension with size 1.
+
+In addition, you could go through the sequence one element at a time, in
+which case the 1st axis will have size 1 also.
+
+Let's see a quick example.
+"""
+
+# Author: Robert Guthrie
+
+import torch
+import torch.autograd as autograd
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+
+torch.manual_seed(1)
+
+######################################################################
+
+lstm = nn.LSTM(3, 3)  # Input dim is 3, output dim is 3
+inputs = [autograd.Variable(torch.randn((1, 3)))
+          for _ in range(5)]  # make a sequence of length 5
+
+# initialize the hidden state.
+hidden = (autograd.Variable(torch.randn(1, 1, 3)),
+          autograd.Variable(torch.randn((1, 1, 3))))
+for i in inputs:
+    # Step through the sequence one element at a time.
+    # after each step, hidden contains the hidden state.
+    out, hidden = lstm(i.view(1, 1, -1), hidden)
+
+# alternatively, we can do the entire sequence all at once.
+# the first value returned by LSTM is all of the hidden states throughout
+# the sequence. the second is just the most recent hidden state
+# (compare the last slice of "out" with "hidden" below, they are the same)
+# The reason for this is that:
+# "out" will give you access to all hidden states in the sequence
+# "hidden" will allow you to continue the sequence and backpropagate,
+# by passing it as an argument to the lstm at a later time
+# Add the extra 2nd dimension
+inputs = torch.cat(inputs).view(len(inputs), 1, -1)
+hidden = (autograd.Variable(torch.randn(1, 1, 3)), autograd.Variable(
+    torch.randn((1, 1, 3))))  # clean out hidden state
+out, hidden = lstm(inputs, hidden)
+print(out)
+print(hidden)
+
+
+######################################################################
+# Example: An LSTM for Part-of-Speech Tagging
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# In this section, we will use an LSTM to get part-of-speech tags. We will
+# not use Viterbi or Forward-Backward or anything like that, but as a
+# (challenging) exercise to the reader, think about how Viterbi could be
+# used after you have seen what is going on.
+#
+# The model is as follows: let our input sentence be
+# :math:`w_1, \dots, w_M`, where :math:`w_i \in V`, our vocab. Also, let
+# :math:`T` be our tag set, and :math:`y_i` the tag of word :math:`w_i`.
+# Denote our prediction of the tag of word :math:`w_i` by
+# :math:`\hat{y}_i`.
+#
+# This is a structure prediction model, where our output is a sequence
+# :math:`\hat{y}_1, \dots, \hat{y}_M`, where :math:`\hat{y}_i \in T`.
+#
+# To do the prediction, pass an LSTM over the sentence. Denote the hidden
+# state at timestep :math:`i` as :math:`h_i`. Also, assign each tag a
+# unique index (like how we had word\_to\_ix in the word embeddings
+# section).
Then our prediction rule for :math:`\hat{y}_i` is +# +# .. math:: \hat{y}_i = \text{argmax}_j \ (\log \text{Softmax}(Ah_i + b))_j +# +# That is, take the log softmax of the affine map of the hidden state, +# and the predicted tag is the tag that has the maximum value in this +# vector. Note this implies immediately that the dimensionality of the +# target space of :math:`A` is :math:`|T|`. +# +# +# Prepare data: + +def prepare_sequence(seq, to_ix): + idxs = [to_ix[w] for w in seq] + tensor = torch.LongTensor(idxs) + return autograd.Variable(tensor) + +training_data = [ + ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]), + ("Everybody read that book".split(), ["NN", "V", "DET", "NN"]) +] +word_to_ix = {} +for sent, tags in training_data: + for word in sent: + if word not in word_to_ix: + word_to_ix[word] = len(word_to_ix) +print(word_to_ix) +tag_to_ix = {"DET": 0, "NN": 1, "V": 2} + +# These will usually be more like 32 or 64 dimensional. +# We will keep them small, so we can see how the weights change as we train. +EMBEDDING_DIM = 6 +HIDDEN_DIM = 6 + +###################################################################### +# Create the model: + +class LSTMTagger(nn.Module): + + def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size): + super(LSTMTagger, self).__init__() + self.hidden_dim = hidden_dim + + self.word_embeddings = nn.Embedding(vocab_size, embedding_dim) + + # The LSTM takes word embeddings as inputs, and outputs hidden states + # with dimensionality hidden_dim. + self.lstm = nn.LSTM(embedding_dim, hidden_dim) + + # The linear layer that maps from hidden state space to tag space + self.hidden2tag = nn.Linear(hidden_dim, tagset_size) + self.hidden = self.init_hidden() + + def init_hidden(self): + # Before we've done anything, we dont have any hidden state. + # Refer to the Pytorch documentation to see exactly + # why they have this dimensionality. + # The axes semantics are (num_layers, minibatch_size, hidden_dim) + return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)), + autograd.Variable(torch.zeros(1, 1, self.hidden_dim))) + + def forward(self, sentence): + embeds = self.word_embeddings(sentence) + lstm_out, self.hidden = self.lstm(embeds.view(len(sentence), 1, -1)) + tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1)) + tag_scores = F.log_softmax(tag_space) + return tag_scores + +###################################################################### +# Train the model: + +model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix)) +loss_function = nn.NLLLoss() +optimizer = optim.SGD(model.parameters(), lr=0.1) + +# See what the scores are before training +# Note that element i,j of the output is the score for tag j for word i. +inputs = prepare_sequence(training_data[0][0], word_to_ix) +tag_scores = model(inputs) +print(tag_scores) + +for epoch in range(300): # again, normally you would NOT do 300 epochs, it is toy data + for sentence, tags in training_data: + # Step 1. Remember that Pytorch accumulates gradients. + # We need to clear them out before each instance + model.zero_grad() + + # Also, we need to clear out the hidden state of the LSTM, + # detaching it from its history on the last instance. + model.hidden = model.init_hidden() + + # Step 2. Get our inputs ready for the network, that is, turn them into + # Variables of word indices. + sentence_in = prepare_sequence(sentence, word_to_ix) + targets = prepare_sequence(tags, tag_to_ix) + + # Step 3. Run our forward pass. 
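+        # (the model returns a len(sentence) x len(tag_to_ix) matrix of
+        # log probabilities, one row per word)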
+        tag_scores = model(sentence_in)
+
+        # Step 4. Compute the loss, gradients, and update the parameters by
+        # calling optimizer.step()
+        loss = loss_function(tag_scores, targets)
+        loss.backward()
+        optimizer.step()
+
+# See what the scores are after training
+inputs = prepare_sequence(training_data[0][0], word_to_ix)
+tag_scores = model(inputs)
+# The sentence is "the dog ate the apple". i,j corresponds to the score for
+# tag j for word i. The predicted tag is the maximum scoring tag.
+# Here, we can see the predicted sequence below is 0 1 2 0 1,
+# since 0 is the index of the maximum value of row 1,
+# 1 is the index of the maximum value of row 2, etc.
+# That is DET NN V DET NN, the correct sequence!
+print(tag_scores)
+
+
+######################################################################
+# Exercise: Augmenting the LSTM part-of-speech tagger with character-level features
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# In the example above, each word had an embedding, which served as the
+# input to our sequence model. Let's augment the word embeddings with a
+# representation derived from the characters of the word. We expect that
+# this should help significantly, since character-level information like
+# affixes has a large bearing on part of speech. For example, words with
+# the affix *-ly* are almost always tagged as adverbs in English.
+#
+# To do this, let :math:`c_w` be the character-level representation of
+# word :math:`w`. Let :math:`x_w` be the word embedding as before. Then
+# the input to our sequence model is the concatenation of :math:`x_w` and
+# :math:`c_w`. So if :math:`x_w` has dimension 5, and :math:`c_w`
+# dimension 3, then our LSTM should accept an input of dimension 8.
+#
+# To get the character level representation, do an LSTM over the
+# characters of a word, and let :math:`c_w` be the final hidden state of
+# this LSTM. Hints:
+#
+# * There are going to be two LSTMs in your new model.
+#   The original one that outputs POS tag scores, and the new one that
+#   outputs a character-level representation of each word.
+# * To do a sequence model over characters, you will have to embed characters.
+#   The character embeddings will be the input to the character LSTM.
+#
diff --git a/beginner_source/nlp/word_embeddings_tutorial.py b/beginner_source/nlp/word_embeddings_tutorial.py
new file mode 100644
index 00000000000..1dc177daa0b
--- /dev/null
+++ b/beginner_source/nlp/word_embeddings_tutorial.py
@@ -0,0 +1,338 @@
+# -*- coding: utf-8 -*-
+r"""
+Word Embeddings: Encoding Lexical Semantics
+===========================================
+
+Word embeddings are dense vectors of real numbers, one per word in your
+vocabulary. In NLP, it is almost always the case that your features are
+words! But how should you represent a word in a computer? You could
+store its ASCII character representation, but that only tells you what
+the word *is*, it doesn't say much about what it *means* (you might be
+able to derive its part of speech from its affixes, or properties from
+its capitalization, but not much). Moreover, in what sense could you
+combine these representations? We often want dense outputs from our
+neural networks, where the inputs are :math:`|V|` dimensional, where
+:math:`V` is our vocabulary, but often the outputs are only a few
+dimensional (if we are only predicting a handful of labels, for
+instance). How do we get from a massive dimensional space to a smaller
+dimensional space?
+
+How about instead of ASCII representations, we use a one-hot encoding?
+That is, we represent the word :math:`w` by
+
+.. math:: \overbrace{\left[ 0, 0, \dots, 1, \dots, 0, 0 \right]}^\text{|V| elements}
+
+where the 1 is in a location unique to :math:`w`. Any other word will
+have a 1 in some other location, and a 0 everywhere else.
+
+There is an enormous drawback to this representation, besides just how
+huge it is. It basically treats all words as independent entities with
+no relation to each other. What we really want is some notion of
+*similarity* between words. Why? Let's see an example.
+
+Suppose we are building a language model. Suppose we have seen the
+sentences
+
+* The mathematician ran to the store.
+* The physicist ran to the store.
+* The mathematician solved the open problem.
+
+in our training data. Now suppose we get a new sentence never before
+seen in our training data:
+
+* The physicist solved the open problem.
+
+Our language model might do OK on this sentence, but wouldn't it be much
+better if we could use the following two facts:
+
+* We have seen mathematician and physicist in the same role in a sentence. Somehow they
+  have a semantic relation.
+* We have seen mathematician in the same role in this new unseen sentence
+  as we are now seeing physicist.
+
+and then infer that physicist is actually a good fit in the new unseen
+sentence? This is what we mean by a notion of similarity: we mean
+*semantic similarity*, not simply having similar orthographic
+representations. It is a technique to combat the sparsity of linguistic
+data, by connecting the dots between what we have seen and what we
+haven't. This example of course relies on a fundamental linguistic
+assumption: that words appearing in similar contexts are related to each
+other semantically. This is called the `distributional
+hypothesis `__.
+
+
+Getting Dense Word Embeddings
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+How can we solve this problem? That is, how could we actually encode
+semantic similarity in words? Maybe we think up some semantic
+attributes. For example, we see that both mathematicians and physicists
+can run, so maybe we give these words a high score for the "is able to
+run" semantic attribute. Think of some other attributes, and imagine
+what you might score some common words on those attributes.
+
+If each attribute is a dimension, then we might give each word a vector,
+like this:
+
+.. math::
+
+   q_\text{mathematician} = \left[ \overbrace{2.3}^\text{can run},
+   \overbrace{9.4}^\text{likes coffee}, \overbrace{-5.5}^\text{majored in Physics}, \dots \right]
+
+.. math::
+
+   q_\text{physicist} = \left[ \overbrace{2.5}^\text{can run},
+   \overbrace{9.1}^\text{likes coffee}, \overbrace{6.4}^\text{majored in Physics}, \dots \right]
+
+Then we can get a measure of similarity between these words by doing:
+
+.. math:: \text{Similarity}(\text{physicist}, \text{mathematician}) = q_\text{physicist} \cdot q_\text{mathematician}
+
+Although it is more common to normalize by the lengths:
+
+.. math::
+
+   \text{Similarity}(\text{physicist}, \text{mathematician}) = \frac{q_\text{physicist} \cdot q_\text{mathematician}}
+   {\| q_\text{physicist} \| \| q_\text{mathematician} \|} = \cos(\phi)
+
+where :math:`\phi` is the angle between the two vectors. That way,
+extremely similar words (words whose embeddings point in the same
+direction) will have similarity 1. Extremely dissimilar words should
+have similarity -1.
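+
+To make this concrete, here is a quick sketch of that computation in
+torch, using the hand-picked attribute scores above truncated to the
+three attributes shown (the numbers, like the attributes themselves,
+are made up purely for illustration)::
+
+    import torch
+
+    q_mathematician = torch.Tensor([2.3, 9.4, -5.5])
+    q_physicist = torch.Tensor([2.5, 9.1, 6.4])
+
+    # cosine similarity: dot product divided by the product of the norms
+    similarity = torch.dot(q_mathematician, q_physicist) \
+        / (q_mathematician.norm() * q_physicist.norm())
+    # roughly 0.44: positive, since the first two attributes agree, but
+    # well short of 1 because the "majored in Physics" scores disagree
+    print(similarity)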
+
+
+You can think of the sparse one-hot vectors from the beginning of this
+section as a special case of these new vectors we have defined, where
+each pair of distinct words has similarity 0, and we gave each word some
+unique semantic attribute. These new vectors are *dense*, which is to
+say their entries are (typically) non-zero.
+
+But these new vectors are a big pain: you could think of thousands of
+different semantic attributes that might be relevant to determining
+similarity, and how on earth would you set the values of the different
+attributes? Central to the idea of deep learning is that the neural
+network learns representations of the features, rather than requiring
+the programmer to design them herself. So why not just let the word
+embeddings be parameters in our model, updated during training? This is
+exactly what we will do. We will have some *latent semantic attributes*
+that the network can, in principle, learn. Note that the word
+embeddings will probably not be interpretable. That is, although with
+our hand-crafted vectors above we can see that mathematicians and
+physicists are similar in that they both like coffee, if we allow a
+neural network to learn the embeddings and see that both mathematicians
+and physicists have a large value in the second dimension, it is not
+clear what that means. They are similar in some latent semantic
+dimension, but this probably has no interpretation to us.
+
+
+In summary, **word embeddings are a representation of the semantics of
+a word, efficiently encoding semantic information that might be relevant
+to the task at hand**. You can embed other things too: part of speech
+tags, parse trees, anything! The idea of feature embeddings is central
+to the field.
+
+
+Word Embeddings in Pytorch
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Before we get to a worked example and an exercise, a few quick notes
+about how to use embeddings in Pytorch and in deep learning programming
+in general. Similar to how we defined a unique index for each word when
+making one-hot vectors, we also need to define an index for each word
+when using embeddings. These will be keys into a lookup table. That is,
+embeddings are stored as a :math:`|V| \times D` matrix, where :math:`D`
+is the dimensionality of the embeddings, such that the word assigned
+index :math:`i` has its embedding stored in the :math:`i`'th row of the
+matrix. In all of my code, the mapping from words to indices is a
+dictionary named word\_to\_ix.
+
+The module that allows you to use embeddings is torch.nn.Embedding,
+which takes two arguments: the vocabulary size, and the dimensionality
+of the embeddings.
+
+To index into this table, you must use torch.LongTensor (since the
+indices are integers, not floats).
+
+"""
+
+# Author: Robert Guthrie
+
+import torch
+import torch.autograd as autograd
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+
+torch.manual_seed(1)
+
+######################################################################
+
+word_to_ix = {"hello": 0, "world": 1}
+embeds = nn.Embedding(2, 5)  # 2 words in vocab, 5 dimensional embeddings
+lookup_tensor = torch.LongTensor([word_to_ix["hello"]])
+hello_embed = embeds(autograd.Variable(lookup_tensor))
+print(hello_embed)
+
+
+######################################################################
+# An Example: N-Gram Language Modeling
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# Recall that in an n-gram language model, given a sequence of words
+# :math:`w`, we want to compute
+#
+# .. math:: P(w_i | w_{i-1}, w_{i-2}, \dots, w_{i-n+1} )
+#
+# where :math:`w_i` is the :math:`i`-th word of the sequence.
+#
+# In this example, we will compute the loss function on some training
+# examples and update the parameters with backpropagation.
+#
+
+CONTEXT_SIZE = 2
+EMBEDDING_DIM = 10
+# We will use Shakespeare Sonnet 2
+test_sentence = """When forty winters shall besiege thy brow,
+And dig deep trenches in thy beauty's field,
+Thy youth's proud livery so gazed on now,
+Will be a totter'd weed of small worth held:
+Then being asked, where all thy beauty lies,
+Where all the treasure of thy lusty days;
+To say, within thine own deep sunken eyes,
+Were an all-eating shame, and thriftless praise.
+How much more praise deserv'd thy beauty's use,
+If thou couldst answer 'This fair child of mine
+Shall sum my count, and make my old excuse,'
+Proving his beauty by succession thine!
+This were to be new made when thou art old,
+And see thy blood warm when thou feel'st it cold.""".split()
+# We should tokenize the input, but we will ignore that for now:
+# build a list of tuples. Each tuple is ([ word_i-2, word_i-1 ], target word)
+trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
+            for i in range(len(test_sentence) - 2)]
+# print the first 3, just so you can see what they look like
+print(trigrams[:3])
+
+vocab = set(test_sentence)
+word_to_ix = {word: i for i, word in enumerate(vocab)}
+
+
+class NGramLanguageModeler(nn.Module):
+
+    def __init__(self, vocab_size, embedding_dim, context_size):
+        super(NGramLanguageModeler, self).__init__()
+        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
+        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
+        self.linear2 = nn.Linear(128, vocab_size)
+
+    def forward(self, inputs):
+        embeds = self.embeddings(inputs).view((1, -1))
+        out = F.relu(self.linear1(embeds))
+        out = self.linear2(out)
+        log_probs = F.log_softmax(out)
+        return log_probs
+
+losses = []
+loss_function = nn.NLLLoss()
+model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
+optimizer = optim.SGD(model.parameters(), lr=0.001)
+
+for epoch in range(10):
+    total_loss = torch.Tensor([0])
+    for context, target in trigrams:
+
+        # Step 1. Prepare the inputs to be passed to the model (i.e., turn
+        # the words into integer indices and wrap them in variables)
+        context_idxs = [word_to_ix[w] for w in context]
+        context_var = autograd.Variable(torch.LongTensor(context_idxs))
+
+        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
+        # new instance, you need to zero out the gradients from the old instance
+        model.zero_grad()
+
+        # Step 3. Run the forward pass, getting log probabilities over next
+        # words
+        log_probs = model(context_var)
+
+        # Step 4. Compute your loss function. (Again, Torch wants the target
+        # word wrapped in a variable)
+        loss = loss_function(log_probs, autograd.Variable(
+            torch.LongTensor([word_to_ix[target]])))
+
+        # Step 5. Do the backward pass and update the parameters
+        loss.backward()
+        optimizer.step()
+
+        total_loss += loss.data
+    losses.append(total_loss)
+print(losses)  # The loss decreased every iteration over the training data!
+
+
+######################################################################
+# Exercise: Computing Word Embeddings: Continuous Bag-of-Words
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# The Continuous Bag-of-Words model (CBOW) is frequently used in NLP deep
+# learning. It is a model that tries to predict words given the context of
+# a few words before and a few words after the target word. This is
+# distinct from language modeling, since CBOW is not sequential and does
+# not have to be probabilistic. Typically, CBOW is used to quickly train
+# word embeddings, and these embeddings are used to initialize the
+# embeddings of some more complicated model. Usually, this is referred to
+# as *pretraining embeddings*. It almost always helps performance by a
+# couple of percent.
+#
+# The CBOW model is as follows. Given a target word :math:`w_i` and an
+# :math:`N` context window on each side, :math:`w_{i-1}, \dots, w_{i-N}`
+# and :math:`w_{i+1}, \dots, w_{i+N}`, referring to all context words
+# collectively as :math:`C`, CBOW tries to minimize
+#
+# .. math:: -\log p(w_i | C) = -\log \text{Softmax}\left(A\left(\sum_{w \in C} q_w\right) + b\right)_{w_i}
+#
+# where :math:`q_w` is the embedding for word :math:`w`, and the
+# subscript :math:`w_i` picks out the component of the output
+# corresponding to the target word.
+#
+# Implement this model in Pytorch by filling in the class below. Some
+# tips:
+#
+# * Think about which parameters you need to define.
+# * Make sure you know what shape each operation expects. Use .view() if you need to
+#   reshape.
+#
+
+CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
+raw_text = """We are about to study the idea of a computational process.
+Computational processes are abstract beings that inhabit computers.
+As they evolve, processes manipulate other abstract things called data.
+The evolution of a process is directed by a pattern of rules
+called a program. People create programs to direct processes. In effect,
+we conjure the spirits of the computer with our spells.""".split()
+
+# Derive the indices from the *set* of words, so that each distinct word
+# gets exactly one index. (Enumerating raw_text directly would assign
+# duplicated words several indices, and the indices would not form a
+# contiguous range of size |V|.)
+vocab = set(raw_text)
+word_to_ix = {word: i for i, word in enumerate(vocab)}
+
+data = []
+for i in range(2, len(raw_text) - 2):
+    context = [raw_text[i - 2], raw_text[i - 1],
+               raw_text[i + 1], raw_text[i + 2]]
+    target = raw_text[i]
+    data.append((context, target))
+print(data[:5])
+
+
+class CBOW(nn.Module):
+
+    def __init__(self):
+        pass
+
+    def forward(self, inputs):
+        pass
+
+# Create your model and train. Here are some functions to help you make
+# the data ready for use by your module.
+
+
+def make_context_vector(context, word_to_ix):
+    idxs = [word_to_ix[w] for w in context]
+    tensor = torch.LongTensor(idxs)
+    return autograd.Variable(tensor)
+
+make_context_vector(data[0][0], word_to_ix)  # example
+
diff --git a/index.rst b/index.rst
index 17986f93bc0..977501a578b 100644
--- a/index.rst
+++ b/index.rst
@@ -41,8 +41,11 @@ Beginner Tutorials

 .. galleryitem:: beginner/transfer_learning_tutorial.py

-.. galleryitem:: beginner/deep_learning_nlp_tutorial.py
+.. customgalleryitem::
+   :tooltip: I am writing this tutorial to focus specifically on NLP for people who have never written code in any deep learning framework
+   :figure: /_static/img/thumbnails/babel.jpg
+   :description: :doc:`/beginner/deep_learning_nlp_tutorial`
+
 .. raw:: html