| 1 | +# -*- coding: utf-8 -*- |
| 2 | +""" |
| 3 | +Preprocess custom text dataset using Torchtext |
| 4 | +=============================================== |
| 5 | +
| 6 | +**Author**: `Anupam Sharma <https://anp-scp.github.io/>`_ |
| 7 | +
| 8 | +This tutorial illustrates the usage of torchtext on a dataset that is not built-in. We will
| 9 | +preprocess a dataset that can then be used to train a sequence-to-sequence
| 10 | +model for machine translation (similar to this tutorial: `Sequence to Sequence Learning
| 11 | +with Neural Networks <https://github.com/bentrevett/pytorch-seq2seq/blob/master/1%20-%20Sequence%\
| 12 | +20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb>`_), but without using the legacy version
| 13 | +of torchtext.
| 14 | +
| 15 | +In this tutorial, we will learn how to: |
| 16 | +
| 17 | +* Read a dataset |
| 18 | +* Tokenize sentences
| 19 | +* Apply transforms to sentences
| 20 | +* Perform bucket batching |
| 21 | +
| 22 | +Let us assume that we need to prepare a dataset to train a model that can perform English to |
| 23 | +German translation. We will use the tab-delimited German-English sentence pairs provided by
| 24 | +the `Tatoeba Project <https://tatoeba.org/en>`_, which can be downloaded from
| 25 | +`this link <https://www.manythings.org/anki/deu-eng.zip>`__. |
| 26 | +
| 27 | +Sentence pairs for other languages can be found at `this link <https://www.manythings.org/anki/>`\
| 28 | +__. |
| 29 | +""" |
| 30 | + |
| 31 | +# %% |
| 32 | +# Setup |
| 33 | +# ----- |
| 34 | +# |
| 35 | +# First, download the dataset, extract the zip, and note the path to the file `deu.txt`. |
| 36 | +# |
| 37 | +# Ensure that the following packages are installed:
| 38 | +# |
| 39 | +# * `Torchdata 0.6.0 <https://pytorch.org/data/beta/index.html>`_ (`Installation instructions \ |
| 40 | +# <https://github.com/pytorch/data>`__) |
| 41 | +# * `Torchtext 0.15.0 <https://pytorch.org/text/stable/index.html>`_ (`Installation instructions \ |
| 42 | +# <https://github.com/pytorch/text>`__) |
| 43 | +# * `Spacy <https://spacy.io/usage>`__ |
| 44 | +# |
| 45 | +# Here, we are using `Spacy` to tokenize text. In simple words, tokenization means
| 46 | +# converting a sentence into a list of words. Spacy is a Python package used for various
| 47 | +# Natural Language Processing (NLP) tasks.
| 48 | +# |
| 49 | +# Download the English and German models from Spacy as shown below: |
| 50 | +# |
| 51 | +# .. code-block:: shell |
| 52 | +# |
| 53 | +# python -m spacy download en_core_web_sm |
| 54 | +# python -m spacy download de_core_news_sm |
| 55 | +# |
| 56 | + |
| 57 | + |
| 58 | +# %% |
| 59 | +# Let us start by importing required modules: |
| 60 | + |
| 61 | +import torchdata.datapipes as dp |
| 62 | +import torchtext.transforms as T |
| 63 | +import spacy |
| 64 | +from torchtext.vocab import build_vocab_from_iterator |
| 65 | +eng = spacy.load("en_core_web_sm") # Load the English model to tokenize English text |
| 66 | +de = spacy.load("de_core_news_sm") # Load the German model to tokenize German text |
| 67 | + |
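| | +# %%
| | +# Optionally, we can confirm which versions are actually installed. This is just a quick
| | +# sanity check; the exact version strings on your machine may differ from the ones listed
| | +# above:
| | +
| | +import torchtext
| | +import torchdata
| | +print(torchtext.__version__, torchdata.__version__, spacy.__version__)
| | +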
| 68 | +# %% |
| 69 | +# Now, we will load the dataset:
| 70 | + |
| 71 | +FILE_PATH = 'data/deu.txt' |
| 72 | +data_pipe = dp.iter.IterableWrapper([FILE_PATH]) |
| 73 | +data_pipe = dp.iter.FileOpener(data_pipe, mode='rb') |
| 74 | +data_pipe = data_pipe.parse_csv(skip_lines=0, delimiter='\t', as_tuple=True) |
| 75 | + |
| 76 | +# %% |
| 77 | +# In the above code block, we are doing the following:
| 78 | +# |
| 79 | +# 1. At line 2, we are creating an iterable of filenames |
| 80 | +# 2. At line 3, we pass the iterable to `FileOpener` which then |
| 81 | +# opens the file in read mode |
| 82 | +# 3. At line 4, we call a function to parse the file, which
| 83 | +#    returns an iterable of tuples, each representing a row
| 84 | +#    of the tab-delimited file
| 85 | +# |
| 86 | +# DataPipes can be thought of as something like a dataset object, on which
| 87 | +# we can perform various operations. |
| 88 | +# Check `this tutorial <https://pytorch.org/data/beta/dp_tutorial.html>`_ for more details on |
| 89 | +# DataPipes. |
| 90 | +# |
| 91 | +# We can verify that the iterable contains the pairs of sentences as shown
| 92 | +# below:
| 93 | + |
| 94 | +for sample in data_pipe: |
| 95 | + print(sample) |
| 96 | + break |
| 97 | + |
| 98 | +# %% |
| 99 | +# Note that we also have attribution details along with the pair of sentences. We will
| 100 | +# write a small function to remove the attribution details: |
| 101 | + |
| 102 | +def removeAttribution(row): |
| 103 | + """ |
| 104 | + Function to keep the first two elements in a tuple |
| 105 | + """ |
| 106 | + return row[:2] |
| 107 | +data_pipe = data_pipe.map(removeAttribution) |
| 108 | + |
| 109 | +# %% |
| 110 | +# The `map` function at line 6 in the above code block can be used to apply a function
| 111 | +# to each element of `data_pipe`. Now, we can verify that `data_pipe` only contains
| 112 | +# pairs of sentences.
| 113 | + |
| 114 | + |
| 115 | +for sample in data_pipe: |
| 116 | + print(sample) |
| 117 | + break |
| 118 | + |
| 119 | +# %% |
| 120 | +# Now, let us define a few functions to perform tokenization:
| 121 | + |
| 122 | +def engTokenize(text): |
| 123 | + """ |
| 124 | + Tokenize an English text and return a list of tokens |
| 125 | + """ |
| 126 | + return [token.text for token in eng.tokenizer(text)] |
| 127 | + |
| 128 | +def deTokenize(text): |
| 129 | + """ |
| 130 | + Tokenize a German text and return a list of tokens |
| 131 | + """ |
| 132 | + return [token.text for token in de.tokenizer(text)] |
| 133 | + |
| 134 | +# %% |
| 135 | +# The above functions accept text and return a list of words,
| 136 | +# as shown below:
| 137 | + |
| 138 | +print(engTokenize("Have a good day!!!")) |
| 139 | +print(deTokenize("Haben Sie einen guten Tag!!!")) |
| 140 | + |
| 141 | +# %% |
| 142 | +# Building the vocabulary |
| 143 | +# ----------------------- |
| 144 | +# Let us consider an English sentence as the source and a German sentence as the target. |
| 145 | +# |
| 146 | +# The vocabulary can be considered as the set of unique words in the dataset.
| 147 | +# We will now build the vocabulary for both our source and target.
| 148 | +# |
| 149 | +# Let us define a function to get tokens from elements of tuples in the iterator. |
| 150 | + |
| 151 | + |
| 152 | +def getTokens(data_iter, place): |
| 153 | + """ |
| 154 | +    Function to yield tokens from an iterator. Since our iterator contains
| 155 | +    tuples of sentences (source and target), the `place` parameter defines which
| 156 | +    index to return the tokens for: `place=0` for source and `place=1` for target.
| 157 | + """ |
| 158 | + for english, german in data_iter: |
| 159 | + if place == 0: |
| 160 | + yield engTokenize(english) |
| 161 | + else: |
| 162 | + yield deTokenize(german) |
| 163 | + |
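| | +# %%
| | +# As a quick optional check, we can pull the first tokenized source sentence from this
| | +# generator to see exactly what the vocabulary builder will consume:
| | +
| | +print(next(getTokens(data_pipe, 0)))
| | +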
| 164 | +# %% |
| 165 | +# Now, we will build vocabulary for source: |
| 166 | + |
| 167 | +source_vocab = build_vocab_from_iterator( |
| 168 | + getTokens(data_pipe,0), |
| 169 | + min_freq=2, |
| 170 | + specials= ['<pad>', '<sos>', '<eos>', '<unk>'], |
| 171 | + special_first=True |
| 172 | +) |
| 173 | +source_vocab.set_default_index(source_vocab['<unk>']) |
| 174 | + |
| 175 | +# %% |
| 176 | +# The code above builds the vocabulary from the iterator. In the above code block:
| 177 | +# |
| 178 | +# * At line 2, we call the `getTokens()` function with `place=0` as we need vocabulary for |
| 179 | +# source sentences. |
| 180 | +# * At line 3, we set `min_freq=2`. This means the function will skip words that occur
| 181 | +#   fewer than 2 times.
| 182 | +# * At line 4, we specify some special tokens: |
| 183 | +# |
| 184 | +# * `<sos>` for start of sentence |
| 185 | +# * `<eos>` for end of sentence |
| 186 | +#   * `<unk>` for unknown words. An example of an unknown word is one that was skipped because
| 187 | +#     of `min_freq=2`.
| 188 | +#   * `<pad>` is the padding token. While training a model, we mostly train in batches. In a
| 189 | +#     batch, there can be sentences of different lengths. So, we pad the shorter sentences with
| 190 | +#     the `<pad>` token to make the length of all sequences in the batch equal.
| 191 | +# |
| 192 | +# * At line 5, we set `special_first=True`, which means `<pad>` will get index 0, `<sos>` index 1,
| 193 | +#   `<eos>` index 2, and `<unk>` index 3 in the vocabulary.
| 194 | +# * At line 7, we set the default index to the index of `<unk>`. That means if a word is not in
| 195 | +#   the vocabulary, we will use `<unk>` in its place.
| 196 | +# |
| 197 | +# Similarly, we will build vocabulary for target sentences: |
| 198 | + |
| 199 | +target_vocab = build_vocab_from_iterator( |
| 200 | + getTokens(data_pipe,1), |
| 201 | + min_freq=2, |
| 202 | + specials= ['<pad>', '<sos>', '<eos>', '<unk>'], |
| 203 | + special_first=True |
| 204 | +) |
| 205 | +target_vocab.set_default_index(target_vocab['<unk>']) |
| 206 | + |
| 207 | +# %% |
| 208 | +# Note that the example above shows how we can add special tokens to our vocabulary. The
| 209 | +# special tokens may change based on your requirements.
| 210 | +#
| 211 | +# Now, we can verify that the special tokens are placed at the beginning, followed by the other
| 212 | +# words. In the code below, `source_vocab.get_itos()` returns a list of tokens ordered by their
| 213 | +# index in the vocabulary.
| 214 | + |
| 215 | +print(source_vocab.get_itos()[:9]) |
| 216 | + |
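| | +# %%
| | +# We can also look tokens up directly in the vocabulary. The special tokens map to the
| | +# indices 0-3 described above, and any out-of-vocabulary word (here, a made-up token)
| | +# falls back to the index of `<unk>`:
| | +
| | +print(source_vocab['<pad>'], source_vocab['<sos>'], source_vocab['<eos>'], source_vocab['<unk>'])
| | +print(source_vocab['qwertzuiop'])  # not in the vocabulary, so it maps to the index of <unk>
| | +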
| 217 | +# %% |
| 218 | +# Numericalize sentences using vocabulary |
| 219 | +# --------------------------------------- |
| 220 | +# After building the vocabulary, we need to convert our sentences to corresponding indices. |
| 221 | +# Let us define some functions for this: |
| 222 | + |
| 223 | +def getTransform(vocab): |
| 224 | + """ |
| 225 | +    Create transforms based on the given vocabulary. The returned transform is applied to a
| 226 | +    sequence of tokens.
| 227 | + """ |
| 228 | +    text_transform = T.Sequential(
| 229 | +        ## converts the sentences to indices based on the given vocabulary
| 230 | +        T.VocabTransform(vocab=vocab),
| 231 | +        ## Add <sos> at the beginning of each sentence. 1 because the index for <sos> in the
| 232 | +        # vocabulary is 1, as seen in the previous section
| 233 | +        T.AddToken(1, begin=True),
| 234 | +        ## Add <eos> at the end of each sentence. 2 because the index for <eos> in the
| 235 | +        # vocabulary is 2, as seen in the previous section
| 236 | +        T.AddToken(2, begin=False)
| 237 | +    )
| 238 | +    return text_transform
| 239 | + |
| 240 | +# %% |
| 241 | +# Now, let us see how to use the above function. The function returns a `Transforms` object,
| 242 | +# which we will use on our sentence. Let us take a random sentence and check how the transform
| 243 | +# works. |
| 244 | + |
| 245 | +temp_list = list(data_pipe) |
| 246 | +some_sentence = temp_list[798][0] |
| 247 | +print("Some sentence=", end="") |
| 248 | +print(some_sentence) |
| 249 | +transformed_sentence = getTransform(source_vocab)(engTokenize(some_sentence)) |
| 250 | +print("Transformed sentence=", end="") |
| 251 | +print(transformed_sentence) |
| 252 | +index_to_string = source_vocab.get_itos() |
| 253 | +for index in transformed_sentence: |
| 254 | + print(index_to_string[index], end=" ") |
| 255 | + |
| 256 | +# %% |
| 257 | +# In the above code:
| 258 | +# |
| 259 | +# * At line 2, we take a source sentence from the list that we created from `data_pipe` at line 1
| 260 | +# * At line 5, we get a transform based on the source vocabulary and apply it to a tokenized
| 261 | +#   sentence. Note that transforms take a list of words, not a sentence.
| 262 | +# * At line 8, we get the mapping of index to string and then use it to get the transformed
| 263 | +#   sentence
| 264 | +# |
| 265 | +# Now we will use DataPipe functions to apply the transform to all our sentences.
| 266 | +# Let us define some more functions for this. |
| 267 | + |
| 268 | +def applyTransform(sequence_pair): |
| 269 | + """ |
| 270 | + Apply transforms to sequence of tokens in a sequence pair |
| 271 | + """ |
| 272 | + |
| 273 | + return ( |
| 274 | + getTransform(source_vocab)(engTokenize(sequence_pair[0])), |
| 275 | + getTransform(target_vocab)(deTokenize(sequence_pair[1])) |
| 276 | + ) |
| 277 | +data_pipe = data_pipe.map(applyTransform) ## Apply the function to each element in the iterator |
| 278 | +temp_list = list(data_pipe) |
| 279 | +print(temp_list[0]) |
| 280 | + |
| 281 | +# %% |
| 282 | +# Make batches (with bucket batch) |
| 283 | +# -------------------------------- |
| 284 | +# Generally, we train models in batches. While working with sequence-to-sequence models, it is
| 285 | +# recommended to keep the lengths of the sequences in a batch similar. For that, we will use the
| 286 | +# `bucketbatch` function of `data_pipe`.
| 287 | +# |
| 288 | +# Let us define some functions that will be used by the `bucketbatch` function. |
| 289 | + |
| 290 | +def sortBucket(bucket): |
| 291 | + """ |
| 292 | +    Function to sort a given bucket. Here, we want to sort based on the lengths of the
| 293 | +    source and target sequences.
| 294 | + """ |
| 295 | + return sorted(bucket, key=lambda x: (len(x[0]), len(x[1]))) |
| 296 | + |
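| | +# %%
| | +# As a quick illustration on a toy bucket (the token lists below are made up and not taken
| | +# from our dataset), pairs with shorter sources come first after sorting:
| | +
| | +print(sortBucket([([1, 2, 3], [4, 5, 6]), ([1], [2]), ([1, 2], [3, 4])]))
| | +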
| 297 | +# %% |
| 298 | +# Now, we will apply the `bucketbatch` function: |
| 299 | + |
| 300 | +data_pipe = data_pipe.bucketbatch( |
| 301 | +    batch_size=4, batch_num=5, bucket_num=1,
| 302 | + use_in_batch_shuffle=False, sort_key=sortBucket |
| 303 | +) |
| 304 | + |
| 305 | +# %% |
| 306 | +# In the above code block: |
| 307 | +# |
| 308 | +# * We keep `batch_size` = 4.
| 309 | +# * `batch_num` is the number of batches to keep in a bucket |
| 310 | +# * `bucket_num` is the number of buckets to keep in a pool for shuffling |
| 311 | +# * `sort_key` specifies the function that takes a bucket and sorts it |
| 312 | +# |
| 313 | +# Now, let us consider a batch of source sentences as `X` and a batch of target sentences as `y`. |
| 314 | +# Generally, while training a model, we predict on a batch of `X` and compare the result with `y`. |
| 315 | +# But, a batch in our `data_pipe` is of the form `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]`: |
| 316 | + |
| 317 | +print(list(data_pipe)[0]) |
| 318 | +# %% |
| 319 | +# So, we will now convert them into the form: `((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))`. |
| 320 | +# For this we will write a small function: |
| 321 | + |
| 322 | +def separateSourceTarget(sequence_pairs): |
| 323 | + """ |
| 324 | + input of form: `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]` |
| 325 | + output of form: `((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))` |
| 326 | + """ |
| 327 | + sources,targets = zip(*sequence_pairs) |
| 328 | + return sources,targets |
| 329 | + |
| 330 | +## Apply the function to each element in the iterator |
| 331 | +data_pipe = data_pipe.map(separateSourceTarget) |
| 332 | +print(list(data_pipe)[0]) |
| 333 | + |
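| | +# %%
| | +# To see the effect of bucket batching, we can also look at the sequence lengths within a
| | +# single batch. Because the pool is sorted before being split into batches, the sources in
| | +# a batch have similar lengths; the exact numbers depend on the shuffling, so your output
| | +# may differ:
| | +
| | +sources, targets = list(data_pipe)[0]
| | +print([len(source) for source in sources])
| | +print([len(target) for target in targets])
| | +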
| 334 | +# %% |
| 335 | +# Now, we have the data as desired. |
| 336 | +# |
| 337 | +# Padding |
| 338 | +# ------- |
| 339 | +# As discussed earlier while building the vocabulary, we need to pad the shorter sentences in a
| 340 | +# batch so that all the sequences in a batch have equal length. We can perform padding as follows:
| 341 | + |
| 342 | +def applyPadding(pair_of_sequences): |
| 343 | + """ |
| 344 | + Convert sequences to tensors and apply padding |
| 345 | + """ |
| 346 | + return (T.ToTensor(0)(list(pair_of_sequences[0])), T.ToTensor(0)(list(pair_of_sequences[1]))) |
| 347 | +## `T.ToTensor(0)` returns a transform that converts the sequence to `torch.tensor` and also applies |
| 348 | +# padding. Here, `0` is passed to the constructor to specify the index of the `<pad>` token in the |
| 349 | +# vocabulary. |
| 350 | +data_pipe = data_pipe.map(applyPadding) |
| 351 | + |
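| | +# %%
| | +# Each batch is now a pair of padded 2-D tensors of shape
| | +# (batch size, longest sequence in the batch). We can check this by printing the shapes of
| | +# the first batch; the value of the second dimension depends on the batch, so it may differ
| | +# on your machine:
| | +
| | +sources, targets = next(iter(data_pipe))
| | +print(sources.shape, targets.shape)
| | +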
| 352 | +# %% |
| 353 | +# Now, we can use the index to string mapping to see how the sequence would look with tokens |
| 354 | +# instead of indices: |
| 355 | + |
| 356 | +source_index_to_string = source_vocab.get_itos() |
| 357 | +target_index_to_string = target_vocab.get_itos() |
| 358 | + |
| 359 | +def showSomeTransformedSentences(data_pipe): |
| 360 | + """ |
| 361 | +    Function to show how the sentences look after applying all transforms.
| 362 | +    Here we try to print the actual words instead of the corresponding indices.
| 363 | + """ |
| 364 | + for sources,targets in data_pipe: |
| 365 | +        if sources[0][-1] != 0:
| 366 | +            continue # Skip batches until we find one where the first source sentence is padded
| 367 | + for i in range(4): |
| 368 | + source = "" |
| 369 | + for token in sources[i]: |
| 370 | + source += " " + source_index_to_string[token] |
| 371 | + target = "" |
| 372 | + for token in targets[i]: |
| 373 | + target += " " + target_index_to_string[token] |
| 374 | + print(f"Source: {source}") |
| 375 | +            print(f"Target: {target}")
| 376 | + break |
| 377 | + |
| 378 | +showSomeTransformedSentences(data_pipe) |
| 379 | +# %% |
| 380 | +# In the above output, we can observe that the shorter sentences are padded with `<pad>`. Now, we
| 381 | +# can use `data_pipe` while writing our training function. |
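| | +#
| | +# For example, a training loop could consume `data_pipe` directly. The `model`, `criterion`,
| | +# `optimizer`, and `num_epochs` below are placeholders that are not defined in this
| | +# tutorial; the snippet is only a sketch of how the batches would be used:
| | +#
| | +# .. code-block:: python
| | +#
| | +#    for epoch in range(num_epochs):
| | +#        for sources, targets in data_pipe:
| | +#            optimizer.zero_grad()
| | +#            output = model(sources, targets)
| | +#            loss = criterion(output, targets)
| | +#            loss.backward()
| | +#            optimizer.step()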
| 382 | +# |
| 383 | +# Some parts of this tutorial were inspired by `this article
| 384 | +# <https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71>`__. |