
Commit 133e5b6

Authored by zhangguanheng66 (Guanheng Zhang) and brianjo
Remove legacy torchtext code from translation tutorial (#1250)
* checkpoint
* checkpoint
* minor changes with reviewer's feedback
* fix typo
* Fix ascii decode error

Co-authored-by: Guanheng Zhang <zhangguanheng@devfair0197.h2.fair>
Co-authored-by: Brian Johnson <brianjo@fb.com>
1 parent 3128e13 commit 133e5b6

File tree

1 file changed: +91 -93 lines changed


beginner_source/torchtext_translation_tutorial.py

Lines changed: 91 additions & 93 deletions
@@ -2,36 +2,25 @@
 Language Translation with TorchText
 ===================================
 
-This tutorial shows how to use several convenience classes of ``torchtext`` to preprocess
+This tutorial shows how to use ``torchtext`` to preprocess
 data from a well-known dataset containing sentences in both English and German and use it to
 train a sequence-to-sequence model with attention that can translate German sentences
 into English.
 
 It is based off of
 `this tutorial <https://github.com/bentrevett/pytorch-seq2seq/blob/master/3%20-%20Neural%20Machine%20Translation%20by%20Jointly%20Learning%20to%20Align%20and%20Translate.ipynb>`__
 from PyTorch community member `Ben Trevett <https://github.com/bentrevett>`__
-and was created by `Seth Weidman <https://github.com/SethHWeidman/>`__ with Ben's permission.
+with Ben's permission. We have updated the tutorial by removing some legacy code.
 
-By the end of this tutorial, you will be able to:
-
-- Preprocess sentences into a commonly-used format for NLP modeling using the following ``torchtext`` convenience classes:
-    - `TranslationDataset <https://torchtext.readthedocs.io/en/latest/datasets.html#torchtext.datasets.TranslationDataset>`__
-    - `Field <https://torchtext.readthedocs.io/en/latest/data.html#torchtext.data.Field>`__
-    - `BucketIterator <https://torchtext.readthedocs.io/en/latest/data.html#torchtext.data.BucketIterator>`__
+By the end of this tutorial, you will be able to preprocess sentences into tensors for NLP modeling and use `torch.utils.data.DataLoader <https://pytorch.org/docs/stable/data.html?highlight=dataloader#torch.utils.data.DataLoader>`__ for training and validating the model.
 """
 
 ######################################################################
-# `Field` and `TranslationDataset`
+# Data Processing
 # ----------------
 # ``torchtext`` has utilities for creating datasets that can be easily
 # iterated through for the purposes of creating a language translation
-# model. One key class is a
-# `Field <https://github.com/pytorch/text/blob/master/torchtext/data/field.py#L64>`__,
-# which specifies the way each sentence should be preprocessed, and another is the
-# `TranslationDataset` ; ``torchtext``
-# has several such datasets; in this tutorial we'll use the
-# `Multi30k dataset <https://github.com/multi30k/dataset>`__, which contains about
-# 30,000 sentences (averaging about 13 words in length) in both English and German.
+# model. In this example, we show how to tokenize a raw text sentence, build a vocabulary, and numericalize the tokens into tensors.
 #
 # Note: the tokenization in this tutorial requires `Spacy <https://spacy.io>`__
 # We use Spacy because it provides strong support for tokenization in languages
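To make the Spacy note concrete, here is a small illustration (not part of the commit) of what the ``get_tokenizer('spacy', ...)`` wrapper introduced later in this diff returns for a raw sentence; the sample sentence and the printed output are only indicative.

    # Illustration only: assumes `python -m spacy download de` has been run.
    from torchtext.data.utils import get_tokenizer
    de_tokenizer = get_tokenizer('spacy', language='de')
    print(de_tokenizer("Zwei Männer stehen am Herd."))
    # -> ['Zwei', 'Männer', 'stehen', 'am', 'Herd', '.']  (roughly; Spacy decides the splits)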
@@ -48,79 +37,92 @@
 #
 # python -m spacy download en
 # python -m spacy download de
-#
-# With Spacy installed, the following code will tokenize each of the sentences
-# in the ``TranslationDataset`` based on the tokenizer defined in the ``Field``
-
-from torchtext.datasets import Multi30k
-from torchtext.data import Field, BucketIterator
-
-SRC = Field(tokenize = "spacy",
-            tokenizer_language="de",
-            init_token = '<sos>',
-            eos_token = '<eos>',
-            lower = True)
-
-TRG = Field(tokenize = "spacy",
-            tokenizer_language="en",
-            init_token = '<sos>',
-            eos_token = '<eos>',
-            lower = True)
-
-train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'),
-                                                    fields = (SRC, TRG))
 
-######################################################################
-# Now that we've defined ``train_data``, we can see an extremely useful
-# feature of ``torchtext``'s ``Field``: the ``build_vocab`` method
-# now allows us to create the vocabulary associated with each language
-
-SRC.build_vocab(train_data, min_freq = 2)
-TRG.build_vocab(train_data, min_freq = 2)
-
-######################################################################
-# Once these lines of code have been run, ``SRC.vocab.stoi`` will be a
-# dictionary with the tokens in the vocabulary as keys and their
-# corresponding indices as values; ``SRC.vocab.itos`` will be the same
-# dictionary with the keys and values swapped. We won't make extensive
-# use of this fact in this tutorial, but this will likely be useful in
-# other NLP tasks you'll encounter.
+import torchtext
+import torch
+from torchtext.data.utils import get_tokenizer
+from collections import Counter
+from torchtext.vocab import Vocab
+from torchtext.utils import download_from_url, extract_archive
+import io
+
+url_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'
+train_urls = ('train.de.gz', 'train.en.gz')
+val_urls = ('val.de.gz', 'val.en.gz')
+test_urls = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')
+
+train_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in train_urls]
+val_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in val_urls]
+test_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in test_urls]
+
+de_tokenizer = get_tokenizer('spacy', language='de')
+en_tokenizer = get_tokenizer('spacy', language='en')
+
+def build_vocab(filepath, tokenizer):
+    counter = Counter()
+    with io.open(filepath, encoding="utf8") as f:
+        for string_ in f:
+            counter.update(tokenizer(string_))
+    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
+
+de_vocab = build_vocab(train_filepaths[0], de_tokenizer)
+en_vocab = build_vocab(train_filepaths[1], en_tokenizer)
+
+def data_process(filepaths):
+    raw_de_iter = iter(io.open(filepaths[0], encoding="utf8"))
+    raw_en_iter = iter(io.open(filepaths[1], encoding="utf8"))
+    data = []
+    for (raw_de, raw_en) in zip(raw_de_iter, raw_en_iter):
+        de_tensor_ = torch.tensor([de_vocab[token] for token in de_tokenizer(raw_de)],
+                                  dtype=torch.long)
+        en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer(raw_en)],
+                                  dtype=torch.long)
+        data.append((de_tensor_, en_tensor_))
+    return data
+
+train_data = data_process(train_filepaths)
+val_data = data_process(val_filepaths)
+test_data = data_process(test_filepaths)
 
 ######################################################################
-# ``BucketIterator``
+# ``DataLoader``
 # ----------------
-# The last ``torchtext`` specific feature we'll use is the ``BucketIterator``,
-# which is easy to use since it takes a ``TranslationDataset`` as its
+# The last ``torch``-specific feature we'll use is the ``DataLoader``,
+# which is easy to use since it takes the data as its
 # first argument. Specifically, as the docs say:
-# Defines an iterator that batches examples of similar lengths together.
-# Minimizes amount of padding needed while producing freshly shuffled
-# batches for each new epoch. See pool for the bucketing procedure used.
+# ``DataLoader`` combines a dataset and a sampler, and provides an iterable over the given dataset. The ``DataLoader`` supports both map-style and iterable-style datasets with single- or multi-process loading, customizing loading order and optional automatic batching (collation) and memory pinning.
+#
+# Please pay attention to ``collate_fn`` (optional), which merges a list of samples to form a mini-batch of Tensor(s). It is used for batched loading from a map-style dataset.
+#
 
 import torch
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
 BATCH_SIZE = 128
+PAD_IDX = de_vocab['<pad>']
+BOS_IDX = de_vocab['<bos>']
+EOS_IDX = de_vocab['<eos>']
+
+from torch.nn.utils.rnn import pad_sequence
+from torch.utils.data import DataLoader
+
+def generate_batch(data_batch):
+    de_batch, en_batch = [], []
+    for (de_item, en_item) in data_batch:
+        de_batch.append(torch.cat([torch.tensor([BOS_IDX]), de_item, torch.tensor([EOS_IDX])], dim=0))
+        en_batch.append(torch.cat([torch.tensor([BOS_IDX]), en_item, torch.tensor([EOS_IDX])], dim=0))
+    de_batch = pad_sequence(de_batch, padding_value=PAD_IDX)
+    en_batch = pad_sequence(en_batch, padding_value=PAD_IDX)
+    return de_batch, en_batch
+
+train_iter = DataLoader(train_data, batch_size=BATCH_SIZE,
+                        shuffle=True, collate_fn=generate_batch)
+valid_iter = DataLoader(val_data, batch_size=BATCH_SIZE,
+                        shuffle=True, collate_fn=generate_batch)
+test_iter = DataLoader(test_data, batch_size=BATCH_SIZE,
+                       shuffle=True, collate_fn=generate_batch)
 
-train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
-    (train_data, valid_data, test_data),
-    batch_size = BATCH_SIZE,
-    device = device)
-
-######################################################################
-# These iterators can be called just like ``DataLoader``s; below, in
-# the ``train`` and ``evaluate`` functions, they are called simply with:
-#
-# ::
-#
-#   for i, batch in enumerate(iterator):
-#
-# Each ``batch`` then has ``src`` and ``trg`` attributes:
-#
-# ::
-#
-#   src = batch.src
-#   trg = batch.trg
 
 ######################################################################
 # Defining our ``nn.Module`` and ``Optimizer``
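Before the model-definition hunks below, a quick sanity-check sketch of what the new pipeline yields. This is not part of the commit; it assumes the added code above (``train_data``, ``de_vocab``, ``train_iter``) has been run, and the exact sizes in the comments will vary from batch to batch.

    # Illustration only: inspect one numericalized pair and one padded batch.
    sample_de, sample_en = train_data[0]            # 1-D LongTensors of token indices
    print(sample_de.shape, sample_en.shape)         # e.g. torch.Size([12]) torch.Size([13])
    print([de_vocab.itos[int(i)] for i in sample_de[:5]])   # first few German tokens back as strings

    src_batch, trg_batch = next(iter(train_iter))   # generate_batch adds <bos>/<eos> and pads
    print(src_batch.shape, trg_batch.shape)         # [max_src_len, BATCH_SIZE] and [max_trg_len, BATCH_SIZE]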
@@ -329,8 +331,8 @@ def forward(self,
         return outputs
 
 
-INPUT_DIM = len(SRC.vocab)
-OUTPUT_DIM = len(TRG.vocab)
+INPUT_DIM = len(de_vocab)
+OUTPUT_DIM = len(en_vocab)
 # ENC_EMB_DIM = 256
 # DEC_EMB_DIM = 256
 # ENC_HID_DIM = 512
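As context for the two changed lines above (again, not part of the diff): the vocabulary sizes are what the encoder and decoder embedding tables are built from, so swapping ``SRC.vocab``/``TRG.vocab`` for ``de_vocab``/``en_vocab`` keeps the model dimensions consistent with the new pipeline. A minimal sketch, with an illustrative embedding width taken from the commented-out defaults:

    import torch.nn as nn

    ENC_EMB_DIM = 256                                       # hypothetical, mirrors the commented default
    enc_embedding = nn.Embedding(INPUT_DIM, ENC_EMB_DIM)    # one row per German vocabulary entry
    dec_embedding = nn.Embedding(OUTPUT_DIM, ENC_EMB_DIM)   # one row per English vocabulary entry
    print(enc_embedding.weight.shape)                       # torch.Size([len(de_vocab), 256])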
@@ -380,7 +382,7 @@ def count_parameters(model: nn.Module):
 # particular, we have to tell the ``nn.CrossEntropyLoss`` function to
 # ignore the indices where the target is simply padding.
 
-PAD_IDX = TRG.vocab.stoi['<pad>']
+PAD_IDX = en_vocab.stoi['<pad>']
 
 criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
 
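A tiny self-contained sketch (not in the commit) of what ``ignore_index`` buys us: target positions equal to ``PAD_IDX`` contribute nothing to the loss, so padding never generates gradient.

    import torch
    import torch.nn as nn

    pad_idx = 1                                    # hypothetical padding index for this toy example
    logits = torch.randn(3, 5)                     # 3 target positions, vocabulary of size 5
    target = torch.tensor([2, 4, pad_idx])         # last position is padding
    masked = nn.CrossEntropyLoss(ignore_index=pad_idx)(logits, target)  # averaged over the 2 non-pad positions
    plain = nn.CrossEntropyLoss()(logits, target)                       # averaged over all 3 positions
    print(masked, plain)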
@@ -392,7 +394,7 @@ def count_parameters(model: nn.Module):
 
 
 def train(model: nn.Module,
-          iterator: BucketIterator,
+          iterator: torch.utils.data.DataLoader,
           optimizer: optim.Optimizer,
           criterion: nn.Module,
           clip: float):
@@ -401,10 +403,8 @@ def train(model: nn.Module,
 
     epoch_loss = 0
 
-    for _, batch in enumerate(iterator):
-
-        src = batch.src
-        trg = batch.trg
+    for _, (src, trg) in enumerate(iterator):
+        src, trg = src.to(device), trg.to(device)
 
         optimizer.zero_grad()
 
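The rest of the training step is unchanged by this commit and therefore not shown in the hunk; for orientation, in this style of seq2seq tutorial it typically continues roughly as below. Treat this as a hedged sketch, not the file's exact code.

    # Sketch (assumed continuation of the loop above):
    output = model(src, trg)                        # [trg_len, batch_size, OUTPUT_DIM]
    output = output[1:].view(-1, output.shape[-1])  # skip the first (<bos>) position and flatten
    trg = trg[1:].view(-1)
    loss = criterion(output, trg)                   # padded positions ignored via ignore_index
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()
    epoch_loss += loss.item()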
@@ -427,7 +427,7 @@ def train(model: nn.Module,
 
 
 def evaluate(model: nn.Module,
-             iterator: BucketIterator,
+             iterator: torch.utils.data.DataLoader,
              criterion: nn.Module):
 
     model.eval()
@@ -436,10 +436,8 @@ def evaluate(model: nn.Module,
 
     with torch.no_grad():
 
-        for _, batch in enumerate(iterator):
-
-            src = batch.src
-            trg = batch.trg
+        for _, (src, trg) in enumerate(iterator):
+            src, trg = src.to(device), trg.to(device)
 
             output = model(src, trg, 0) #turn off teacher forcing
 
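The third argument in ``model(src, trg, 0)`` is the teacher-forcing ratio: setting it to 0 during evaluation means the decoder always consumes its own previous prediction instead of the ground-truth token. A hedged sketch of how that choice is usually made inside the decoder loop (illustrative names, not this file's exact code):

    import random

    # Sketch: choose the next decoder input at step t.
    teacher_force = random.random() < teacher_forcing_ratio   # ratio is 0 when evaluating
    top1 = output.argmax(1)                                    # model's own best guess
    dec_input = trg[t] if teacher_force else top1              # ground truth vs. prediction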
@@ -470,8 +468,8 @@ def epoch_time(start_time: int,
 
     start_time = time.time()
 
-    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
-    valid_loss = evaluate(model, valid_iterator, criterion)
+    train_loss = train(model, train_iter, optimizer, criterion, CLIP)
+    valid_loss = evaluate(model, valid_iter, criterion)
 
     end_time = time.time()
 
@@ -481,7 +479,7 @@ def epoch_time(start_time: int,
     print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
     print(f'\t Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f}')
 
-test_loss = evaluate(model, test_iterator, criterion)
+test_loss = evaluate(model, test_iter, criterion)
 
 print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')
 
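One last aside (not part of the diff): the perplexity printed above is simply the exponential of the average cross-entropy loss, which is why the two numbers always move together.

    import math

    avg_loss = 2.197            # hypothetical cross-entropy value
    print(math.exp(avg_loss))   # ≈ 9.0, the corresponding perplexity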