From b4c030bd8c2c15f9a9ffcd2960bc813f2343a3ae Mon Sep 17 00:00:00 2001
From: Guanheng Zhang
Date: Wed, 18 Nov 2020 15:04:11 -0800
Subject: [PATCH 1/5] checkpoint

---
 .../torchtext_translation_tutorial.py         | 189 +++++++++---------
 1 file changed, 94 insertions(+), 95 deletions(-)

diff --git a/beginner_source/torchtext_translation_tutorial.py b/beginner_source/torchtext_translation_tutorial.py
index 9354cffd242..258c5d7b12c 100644
--- a/beginner_source/torchtext_translation_tutorial.py
+++ b/beginner_source/torchtext_translation_tutorial.py
@@ -2,7 +2,7 @@
 Language Translation with TorchText
 ===================================
 
-This tutorial shows how to use several convenience classes of ``torchtext`` to preprocess
+This tutorial shows how to use ``torchtext`` to preprocess
 data from a well-known dataset containing sentences in both English and German and use it to
 train a sequence-to-sequence model with attention that can translate German sentences
 into English.
@@ -10,28 +10,17 @@
 It is based off of `this tutorial <https://github.com/bentrevett/pytorch-seq2seq/blob/master/3%20-%20Neural%20Machine%20Translation%20by%20Jointly%20Learning%20to%20Align%20and%20Translate.ipynb>`__
 from PyTorch community member `Ben Trevett <https://github.com/bentrevett>`__
-and was created by `Seth Weidman <https://github.com/SethHWeidman/>`__ with Ben's permission.
+with Ben's permission. We update the tutorials by removing some legecy code.
 
-By the end of this tutorial, you will be able to:
-
-- Preprocess sentences into a commonly-used format for NLP modeling using the following ``torchtext`` convenience classes:
-  - `TranslationDataset <https://torchtext.readthedocs.io/en/latest/datasets.html#torchtext.datasets.TranslationDataset>`__
-  - `Field <https://torchtext.readthedocs.io/en/latest/data.html#torchtext.data.Field>`__
-  - `BucketIterator <https://torchtext.readthedocs.io/en/latest/data.html#torchtext.data.BucketIterator>`__
+By the end of this tutorial, you will be able to preprocess sentences into tensors for NLP modeling and use `torch.utils.data.DataLoader <https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader>`__ for training and validating the model.
 """
 
 ######################################################################
-# `Field` and `TranslationDataset`
+# Data Processing
 # ----------------
 # ``torchtext`` has utilities for creating datasets that can be easily
 # iterated through for the purposes of creating a language translation
-# model. One key class is a
-# `Field <https://torchtext.readthedocs.io/en/latest/data.html#torchtext.data.Field>`__,
-# which specifies the way each sentence should be preprocessed, and another is the
-# `TranslationDataset` ; ``torchtext``
-# has several such datasets; in this tutorial we'll use the
-# `Multi30k dataset <https://github.com/multi30k/dataset>`__, which contains about
-# 30,000 sentences (averaging about 13 words in length) in both English and German.
+# model. In this example, we show how to tokenize a raw text sentence, build the vocabulary, and numericalize tokens into tensors.
 #
 # Note: the tokenization in this tutorial requires `Spacy <https://spacy.io>`__
 # We use Spacy because it provides strong support for tokenization in languages
 # other than English.
@@ -46,81 +35,95 @@
 #
 # ::
 #
-#    python -m spacy download en
-#    python -m spacy download de
-#
-# With Spacy installed, the following code will tokenize each of the sentences
-# in the ``TranslationDataset`` based on the tokenizer defined in the ``Field``
-
-from torchtext.datasets import Multi30k
-from torchtext.data import Field, BucketIterator
-
-SRC = Field(tokenize = "spacy",
-            tokenizer_language="de",
-            init_token = '<sos>',
-            eos_token = '<eos>',
-            lower = True)
-
-TRG = Field(tokenize = "spacy",
-            tokenizer_language="en",
-            init_token = '<sos>',
-            eos_token = '<eos>',
-            lower = True)
-
-train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'),
-                                                    fields = (SRC, TRG))
+#  python -m spacy download en
+#  python -m spacy download de
 
-######################################################################
-# Now that we've defined ``train_data``, we can see an extremely useful
-# feature of ``torchtext``'s ``Field``: the ``build_vocab`` method
-# now allows us to create the vocabulary associated with each language
-
-SRC.build_vocab(train_data, min_freq = 2)
-TRG.build_vocab(train_data, min_freq = 2)
-
-######################################################################
-# Once these lines of code have been run, ``SRC.vocab.stoi`` will be a
-# dictionary with the tokens in the vocabulary as keys and their
-# corresponding indices as values; ``SRC.vocab.itos`` will be the same
-# dictionary with the keys and values swapped. We won't make extensive
-# use of this fact in this tutorial, but this will likely be useful in
-# other NLP tasks you'll encounter.
+import torchtext
+import torch
+from torchtext.data.utils import get_tokenizer
+from collections import Counter
+from torchtext.vocab import Vocab
+from torchtext.utils import download_from_url, extract_archive
+import io
+
+url_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'
+train_urls = ('train.de.gz', 'train.en.gz')
+val_urls = ('val.de.gz', 'val.en.gz')
+test_urls = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')
+
+train_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in train_urls]
+val_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in val_urls]
+test_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in test_urls]
+
+de_tokenizer = get_tokenizer('spacy', language='de')
+en_tokenizer = get_tokenizer('spacy', language='en')
+
+def build_vocab(filepath, tokenizer):
+  counter = Counter()
+  with io.open(filepath, encoding="utf8") as f:
+    for string_ in f:
+      counter.update(tokenizer(string_))
+  return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
+
+de_vocab = build_vocab(train_filepaths[0], de_tokenizer)
+en_vocab = build_vocab(train_filepaths[1], en_tokenizer)
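+
+# As a quick sanity check (a minimal sketch, assuming the Multi30k download
+# above succeeded), we can peek at the vocab we just built. The specials we
+# passed in occupy the lowest indices:
+
+print(len(de_vocab))           # vocab size; the exact number depends on the data
+print(de_vocab.itos[:4])       # ['<unk>', '<pad>', '<bos>', '<eos>']
+print(de_vocab.stoi['<pad>'])  # index of the padding token, reused below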
+
+def data_process(raw_de_iter, raw_en_iter):
+  data_ = []
+  for (raw_de, raw_en) in zip(raw_de_iter, raw_en_iter):
+    de_tensor_ = torch.tensor([de_vocab[token] for token in de_tokenizer(raw_de)],
+                              dtype=torch.long)
+    en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer(raw_en)],
+                              dtype=torch.long)
+    data_.append((de_tensor_, en_tensor_))
+  return data_
+
+train_data = data_process(iter(io.open(train_filepaths[0])),
+                          iter(io.open(train_filepaths[1])))
+val_data = data_process(iter(io.open(val_filepaths[0])),
+                        iter(io.open(val_filepaths[1])))
+test_data = data_process(iter(io.open(test_filepaths[0])),
+                         iter(io.open(test_filepaths[1])))
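+
+# Each element of ``train_data`` is now a pair of 1-D index tensors. As an
+# illustration (a minimal sketch; the exact tokens depend on the raw data),
+# ``itos`` maps the first example back to the tokens it came from:
+
+de_tensor, en_tensor = train_data[0]
+print(de_tensor.shape, en_tensor.shape)                  # one index per token
+print([de_vocab.itos[idx.item()] for idx in de_tensor])  # the German tokens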
 
 ######################################################################
-# ``BucketIterator``
+# ``DataLoader``
 # ----------------
-# The last ``torchtext`` specific feature we'll use is the ``BucketIterator``,
-# which is easy to use since it takes a ``TranslationDataset`` as its
+# The last ``torch`` specific feature we'll use is the ``DataLoader``,
+# which is easy to use since it takes the data as its
 # first argument. Specifically, as the docs say:
-# Defines an iterator that batches examples of similar lengths together.
-# Minimizes amount of padding needed while producing freshly shuffled
-# batches for each new epoch. See pool for the bucketing procedure used.
+# ``DataLoader`` combines a dataset and a sampler, and provides an iterable over the given dataset. The ``DataLoader`` supports both map-style and iterable-style datasets with single- or multi-process loading, customizing loading order and optional automatic batching (collation) and memory pinning.
+#
+# Please pay attention to ``collate_fn`` (optional) that merges a list of samples to form a mini-batch of Tensor(s). Used when using batched loading from a map-style dataset.
+#
 
 import torch
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
 BATCH_SIZE = 128
+PAD_IDX = de_vocab['<pad>']
+BOS_IDX = de_vocab['<bos>']
+EOS_IDX = de_vocab['<eos>']
+
+from torch.nn.utils.rnn import pad_sequence
+from torch.utils.data import DataLoader
+
+def generate_batch(data_batch):
+  de_batch, en_batch = [], []
+  for (de_item, en_item) in data_batch:
+    de_batch.append(torch.cat([torch.tensor([BOS_IDX]), de_item, torch.tensor([EOS_IDX])], dim=0))
+    en_batch.append(torch.cat([torch.tensor([BOS_IDX]), en_item, torch.tensor([EOS_IDX])], dim=0))
+  de_batch = pad_sequence(de_batch, padding_value=PAD_IDX)
+  en_batch = pad_sequence(en_batch, padding_value=PAD_IDX)
+  return de_batch, en_batch
+
+train_iter = DataLoader(train_data, batch_size=BATCH_SIZE,
+                        shuffle=True, collate_fn=generate_batch)
+valid_iter = DataLoader(val_data, batch_size=BATCH_SIZE,
+                        shuffle=True, collate_fn=generate_batch)
+test_iter = DataLoader(test_data, batch_size=BATCH_SIZE,
+                       shuffle=True, collate_fn=generate_batch)
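+
+# As a quick check (a minimal sketch relying on the iterators above), each
+# batch produced by ``generate_batch`` has shape ``(sequence length, batch
+# size)``: the length of the longest sentence in the batch plus two for the
+# ``<bos>``/``<eos>`` markers, with shorter sentences filled up with ``PAD_IDX``:
+
+de_batch, en_batch = next(iter(train_iter))
+print(de_batch.shape)               # e.g. torch.Size([27, 128]); varies per batch
+print((de_batch == PAD_IDX).sum())  # number of positions that are just padding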
 
-train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
-    (train_data, valid_data, test_data),
-    batch_size = BATCH_SIZE,
-    device = device)
-
-######################################################################
-# These iterators can be called just like ``DataLoader``s; below, in
-# the ``train`` and ``evaluate`` functions, they are called simply with:
-#
-# ::
-#
-#    for i, batch in enumerate(iterator):
-#
-# Each ``batch`` then has ``src`` and ``trg`` attributes:
-#
-# ::
-#
-#    src = batch.src
-#    trg = batch.trg
 
 ######################################################################
 # Defining our ``nn.Module`` and ``Optimizer``
@@ -329,8 +332,8 @@ def forward(self,
 
     return outputs
 
-INPUT_DIM = len(SRC.vocab)
-OUTPUT_DIM = len(TRG.vocab)
+INPUT_DIM = len(de_vocab)
+OUTPUT_DIM = len(en_vocab)
 # ENC_EMB_DIM = 256
 # DEC_EMB_DIM = 256
 # ENC_HID_DIM = 512
@@ -380,7 +383,7 @@ def count_parameters(model: nn.Module):
 # particular, we have to tell the ``nn.CrossEntropyLoss`` function to
 # ignore the indices where the target is simply padding.
 
-PAD_IDX = TRG.vocab.stoi['<pad>']
+PAD_IDX = en_vocab.stoi['<pad>']
 
 criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
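+
+# To see what ``ignore_index`` does, here is a self-contained sketch with
+# made-up logits, separate from the translation model: compare the loss with
+# and without the padded position masked out.
+
+logits = torch.randn(3, len(en_vocab))        # scores for 3 target positions
+target = torch.tensor([5, 7, PAD_IDX])        # the last position is padding
+print(criterion(logits, target))              # averaged over the 2 real tokens
+print(nn.CrossEntropyLoss()(logits, target))  # padding wrongly counted as a class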
@@ -392,7 +395,7 @@ def count_parameters(model: nn.Module):
 
 
 def train(model: nn.Module,
-          iterator: BucketIterator,
+          iterator,
           optimizer: optim.Optimizer,
           criterion: nn.Module,
           clip: float):
@@ -401,10 +404,8 @@ def train(model: nn.Module,
 
     epoch_loss = 0
 
-    for _, batch in enumerate(iterator):
-
-        src = batch.src
-        trg = batch.trg
+    for _, (src, trg) in enumerate(iterator):
+        src, trg = src.to(device), trg.to(device)
 
         optimizer.zero_grad()
 
@@ -427,7 +428,7 @@ def train(model: nn.Module,
 
 
 def evaluate(model: nn.Module,
-             iterator: BucketIterator,
+             iterator,
              criterion: nn.Module):
 
     model.eval()
@@ -436,10 +437,8 @@ def evaluate(model: nn.Module,
 
     with torch.no_grad():
 
-        for _, batch in enumerate(iterator):
-
-            src = batch.src
-            trg = batch.trg
+        for _, (src, trg) in enumerate(iterator):
+            src, trg = src.to(device), trg.to(device)
 
             output = model(src, trg, 0) #turn off teacher forcing
 
@@ -470,8 +469,8 @@ def epoch_time(start_time: int,
 
     start_time = time.time()
 
-    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
-    valid_loss = evaluate(model, valid_iterator, criterion)
+    train_loss = train(model, train_iter, optimizer, criterion, CLIP)
+    valid_loss = evaluate(model, valid_iter, criterion)
 
     end_time = time.time()
 
@@ -481,7 +480,7 @@ def epoch_time(start_time: int,
     print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
     print(f'\t Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f}')
 
-test_loss = evaluate(model, test_iterator, criterion)
+test_loss = evaluate(model, test_iter, criterion)
 
 print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

From c63ee4fb4f5afeeddb3a6cc88ee1e07876da730c Mon Sep 17 00:00:00 2001
From: Guanheng Zhang
Date: Wed, 18 Nov 2020 15:10:50 -0800
Subject: [PATCH 2/5] checkpoint

---
 beginner_source/torchtext_translation_tutorial.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/beginner_source/torchtext_translation_tutorial.py b/beginner_source/torchtext_translation_tutorial.py
index 258c5d7b12c..170cdc1bcfd 100644
--- a/beginner_source/torchtext_translation_tutorial.py
+++ b/beginner_source/torchtext_translation_tutorial.py
@@ -35,8 +35,8 @@
 #
 # ::
 #
-#  python -m spacy download en
-#  python -m spacy download de
+#    python -m spacy download en
+#    python -m spacy download de
 
 import torchtext
 import torch
@@ -395,7 +395,7 @@ def count_parameters(model: nn.Module):
 
 
 def train(model: nn.Module,
-          iterator,
+          iterator: torch.utils.data.DataLoader,
           optimizer: optim.Optimizer,
           criterion: nn.Module,
           clip: float):
@@ -428,7 +428,7 @@ def train(model: nn.Module,
 
 
 def evaluate(model: nn.Module,
-             iterator,
+             iterator: torch.utils.data.DataLoader,
             criterion: nn.Module):
 
     model.eval()

From 21a1115fb98a5ba2d03333716b64032a3fb55155 Mon Sep 17 00:00:00 2001
From: Guanheng Zhang
Date: Fri, 20 Nov 2020 11:09:46 -0800
Subject: [PATCH 3/5] minor changes with reviewer's feedback

---
 .../torchtext_translation_tutorial.py         | 20 +++++++++----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/beginner_source/torchtext_translation_tutorial.py b/beginner_source/torchtext_translation_tutorial.py
index 170cdc1bcfd..44dd5703265 100644
--- a/beginner_source/torchtext_translation_tutorial.py
+++ b/beginner_source/torchtext_translation_tutorial.py
@@ -68,22 +68,20 @@ def build_vocab(filepath, tokenizer):
 de_vocab = build_vocab(train_filepaths[0], de_tokenizer)
 en_vocab = build_vocab(train_filepaths[1], en_tokenizer)
 
-def data_process(raw_de_iter, raw_en_iter):
-  data_ = []
+def data_process(filepaths):
+  raw_de_iter, raw_en_iter = iter(io.open(filepaths[0])), iter(io.open(filepaths[1]))
+  data = []
   for (raw_de, raw_en) in zip(raw_de_iter, raw_en_iter):
     de_tensor_ = torch.tensor([de_vocab[token] for token in de_tokenizer(raw_de)],
                               dtype=torch.long)
     en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer(raw_en)],
                               dtype=torch.long)
-    data_.append((de_tensor_, en_tensor_))
-  return data_
-
-train_data = data_process(iter(io.open(train_filepaths[0])),
-                          iter(io.open(train_filepaths[1])))
-val_data = data_process(iter(io.open(val_filepaths[0])),
-                        iter(io.open(val_filepaths[1])))
-test_data = data_process(iter(io.open(test_filepaths[0])),
-                         iter(io.open(test_filepaths[1])))
+    data.append((de_tensor_, en_tensor_))
+  return data
+
+train_data = data_process(train_filepaths)
+val_data = data_process(val_filepaths)
+test_data = data_process(test_filepaths)
 
 ######################################################################
 # ``DataLoader``

From 14c1f5f0cda71836e961ed3e2534280ce3712f09 Mon Sep 17 00:00:00 2001
From: Guanheng Zhang
Date: Mon, 30 Nov 2020 12:10:13 -0800
Subject: [PATCH 4/5] fix typo

---
 beginner_source/torchtext_translation_tutorial.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/beginner_source/torchtext_translation_tutorial.py b/beginner_source/torchtext_translation_tutorial.py
index 44dd5703265..02b76b44d8a 100644
--- a/beginner_source/torchtext_translation_tutorial.py
+++ b/beginner_source/torchtext_translation_tutorial.py
@@ -10,7 +10,7 @@
 It is based off of `this tutorial <https://github.com/bentrevett/pytorch-seq2seq/blob/master/3%20-%20Neural%20Machine%20Translation%20by%20Jointly%20Learning%20to%20Align%20and%20Translate.ipynb>`__
 from PyTorch community member `Ben Trevett <https://github.com/bentrevett>`__
-with Ben's permission. We update the tutorials by removing some legecy code.
+with Ben's permission. We update the tutorials by removing some legacy code.
 
 By the end of this tutorial, you will be able to preprocess sentences into tensors for NLP modeling and use `torch.utils.data.DataLoader <https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader>`__ for training and validating the model.
 """

From 2f4398334efa257e2b19e32db0b47a9369834fc6 Mon Sep 17 00:00:00 2001
From: Guanheng Zhang
Date: Wed, 2 Dec 2020 13:10:53 -0800
Subject: [PATCH 5/5] Fix ascii decode error

---
 beginner_source/torchtext_translation_tutorial.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/beginner_source/torchtext_translation_tutorial.py b/beginner_source/torchtext_translation_tutorial.py
index 02b76b44d8a..ce99c2b8f0d 100644
--- a/beginner_source/torchtext_translation_tutorial.py
+++ b/beginner_source/torchtext_translation_tutorial.py
@@ -69,7 +69,8 @@ def build_vocab(filepath, tokenizer):
 en_vocab = build_vocab(train_filepaths[1], en_tokenizer)
 
 def data_process(filepaths):
-  raw_de_iter, raw_en_iter = iter(io.open(filepaths[0])), iter(io.open(filepaths[1]))
+  raw_de_iter = iter(io.open(filepaths[0], encoding="utf8"))
+  raw_en_iter = iter(io.open(filepaths[1], encoding="utf8"))
   data = []
   for (raw_de, raw_en) in zip(raw_de_iter, raw_en_iter):
     de_tensor_ = torch.tensor([de_vocab[token] for token in de_tokenizer(raw_de)],