From f24a95983110a25d1c4237e95b90907ff0ea3589 Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Thu, 4 May 2023 19:16:55 +0530 Subject: [PATCH 01/31] added intro --- .../torchtext_custom_dataset_tutorial.py | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 beginner_source/torchtext_custom_dataset_tutorial.py diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py new file mode 100644 index 00000000000..daa6f39a28c --- /dev/null +++ b/beginner_source/torchtext_custom_dataset_tutorial.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- +""" +Preaparing custom text dataset using Torchtext +============================================== + +**Author**: `Anupam Sharma `_ + +This tutorial is regarding the preparation of a text dataset using Torchtext. In the tutorial, we +will be preparing a custom dataset that can be further utilized to train a sequence-to-sequence +model for machine translation (something like, in this tutorial: `Sequence to Sequence Learning +with Neural Networks `_) but using Torchtext 0.15.0 instead +of a legacy version. + +In this tutorial, we will learn how to: + +* Read a dataset +* Tokenize sentence +* Apply transforms to sentence +* Perform bucket batching + +Let us assume that we need to prepare a dataset to train a model that can perform English to +Finnish translation. We will use a tab-delimited Finnish - English sentence pairs provided by +the `Tatoeba Project `_ which can be downloaded from this link: `Click +Here `__ +""" + From a0ac5d5a93fd3b149dd1fe3aecebf4681d5ee7e5 Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Thu, 4 May 2023 19:18:10 +0530 Subject: [PATCH 02/31] added setup section --- .../torchtext_custom_dataset_tutorial.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py index daa6f39a28c..62b7735ba47 100644 --- a/beginner_source/torchtext_custom_dataset_tutorial.py +++ b/beginner_source/torchtext_custom_dataset_tutorial.py @@ -25,3 +25,27 @@ Here `__ """ +# %% +# Setup +# ----- +# +# First, download the dataset, extract the zip, and note the path to the file `fin.txt`. +# The dataset can be downloaded from this link: `Click Here `__ . +# +# Ensure that following packages are installed: +# +# * `Torchdata 0.6.0 `_ (Installation instructions: `C\ +# lick here `__) +# * `Torchtext 0.15.0 `_ (Installation instructions:\ +# `Click here `__) +# * Spacy (Docs: `Click here `__) +# +# Here, we are using `Spacy` to tokenize text. In simple words tokenization means to +# convert a sentence to list of words. Spacy is a python package used for various Natural +# Language Processing (NLP) tasks. 
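+#
+# To make the idea of tokenization concrete, here is a rough sketch (illustrative only,
+# assuming the `en_core_web_sm` model from the commands below is already downloaded): ::
+#
+#    import spacy
+#    eng = spacy.load("en_core_web_sm")
+#    print([token.text for token in eng.tokenizer("This is a sentence.")])
+#    # ['This', 'is', 'a', 'sentence', '.']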
+# +# Download the English and Finnish models from spacy as shown below: :: +# +# python -m spacy download en_core_web_sm +# python -m spacy download fi_core_news_sm From d1a010b74f56928a8853daa86b3bae3c44f91da0 Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Thu, 4 May 2023 19:20:19 +0530 Subject: [PATCH 03/31] import packages and read dataset --- .../torchtext_custom_dataset_tutorial.py | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py index 62b7735ba47..14f88da3511 100644 --- a/beginner_source/torchtext_custom_dataset_tutorial.py +++ b/beginner_source/torchtext_custom_dataset_tutorial.py @@ -49,3 +49,43 @@ # # python -m spacy download en_core_web_sm # python -m spacy download fi_core_news_sm + + +# %% +# Let us start by importing required modules: + +import torchdata.datapipes as dp +import torchtext.transforms as T +import spacy +from torchtext.vocab import build_vocab_from_iterator +eng = spacy.load("en_core_web_sm") # Load the English model to be used for tokenizing +fin = spacy.load("fi_core_news_sm") # Load the Finnish model to be used for tokenizing + +# %% +# Now we will load the dataset + +FILE_PATH = 'fin.txt' +dataPipe = dp.iter.IterableWrapper([FILE_PATH]) +dataPipe = dp.iter.FileOpener(dataPipe, mode='rb') +dataPipe = dataPipe.parse_csv(skip_lines=0, delimiter='\t', as_tuple=True) + +# %% +# In the above code block, we are doing following things: +# +# 1. At line 2, we are creating an iterable of filenames +# 2. At line 3, we pass the iterable to `FileOpener` which then +# opens the file in read mode +# 3. At line 4, we call a function to parse the file, which +# again returns an iterable of tuples representing each rows +# of the tab-delimited file +# +# Data pipes can be thought of something like a dataset object, on which +# we can perform various operations. Check `this tutorial `_ for more details on data pipes. +# +# We can verify if the iterable has the pair of sentences as shown +# below: + +for sample in dataPipe: + print(sample) + break From b8d1dfa27fc63f00ac03007c681b258b3956a891 Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Thu, 4 May 2023 19:21:36 +0530 Subject: [PATCH 04/31] remove attributions from dataset --- .../torchtext_custom_dataset_tutorial.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py index 14f88da3511..9cf9441c640 100644 --- a/beginner_source/torchtext_custom_dataset_tutorial.py +++ b/beginner_source/torchtext_custom_dataset_tutorial.py @@ -89,3 +89,25 @@ for sample in dataPipe: print(sample) break + +# %% +# Note that we also have attribution details along with pair of sentences. We will +# write a small function to remove the attribution details: + +def remove_attribution(row): + """ + Function to keep the first two elements in a tuple + """ + return row[:2] +dataPipe = dataPipe.map(remove_attribution) + +# %% +# The `map` function at line 2 in above code block can be used to apply some function +# on each elements of data pipe. Now, we can verify that the data pipe only contains +# pair of sentences. 
+ + +for sample in dataPipe: + print(sample) + break + From da7d346bcc238b4b0389d30eb0a3272253a53f4f Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Thu, 4 May 2023 19:22:46 +0530 Subject: [PATCH 05/31] added functions for tokenization --- .../torchtext_custom_dataset_tutorial.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py index 9cf9441c640..33da9be38ad 100644 --- a/beginner_source/torchtext_custom_dataset_tutorial.py +++ b/beginner_source/torchtext_custom_dataset_tutorial.py @@ -111,3 +111,25 @@ def remove_attribution(row): print(sample) break +# %% +# Now, let us define few functions to perform tokenization: + +def eng_tokenize(text): + """ + Tokenize an English text and returns list of tokens + """ + return [token.text for token in eng.tokenizer(text)] + +def fin_tokenize(text): + """ + Tokenize a Finnish text and returns list of tokens + """ + return [token.text for token in fin.tokenizer(text)] + +# %% +# Above function accepts a text and returns a list of words +# as shown below: + +print(eng_tokenize("Have a good day!!!")) +print(fin_tokenize("Hyvää päivänjatkoa!!!")) + From 86f56e10f9b8b5949b3a78eb3009270a3e786015 Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Thu, 4 May 2023 19:23:48 +0530 Subject: [PATCH 06/31] building the vocabulary --- .../torchtext_custom_dataset_tutorial.py | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py index 33da9be38ad..5aa6f8d8bed 100644 --- a/beginner_source/torchtext_custom_dataset_tutorial.py +++ b/beginner_source/torchtext_custom_dataset_tutorial.py @@ -133,3 +133,69 @@ def fin_tokenize(text): print(eng_tokenize("Have a good day!!!")) print(fin_tokenize("Hyvää päivänjatkoa!!!")) +# %% +# Building the vocabulary +# ----------------------- +# Let us consider an English sentence as the source and a Finnish sentence as the target. +# +# Vocabulary can be considered as the set of unique words we have in the dataset. +# We will build vocabulary for both our source and target now. +# +# Let us define a function to get tokens from elements of tuples in the iterator. +# The comments within the function specifies the need and working of it: + +def get_tokens(data_iter, place): + """ + Function to yield tokens from an iterator. Since, our iterator contains + tuple of sentences (source and target), `place` parameters defines for which + index to return the tokens for. `place=0` for source and `place=1` for target + """ + for english, finnish in data_iter: + if place == 0: + yield eng_tokenize(english) + else: + yield fin_tokenize(finnish) + +# %% +# Now, we will build vocabulary for source: + +sourceVocab = build_vocab_from_iterator( + get_tokens(dataPipe,0), + min_freq=2, + specials= ['', '', '', ''], + special_first=True +) +sourceVocab.set_default_index(sourceVocab['']) + +# %% +# The code above, builds the vocabulary from the iterator. In the above code block: +# +# * At line 2, we call the `get_tokens()` function with `place=0` as we need vocabulary for +# source sentences. +# * At line 3, we set `min_freq=2`. This means, the function will skip those words that occurs +# less than 2 times. 
+# * At line 4, we specify some special tokens:
+#
+#   * `<sos>` for start of sentence
+#   * `<eos>` for end of senetence
+#   * `<unk>` for unknown words. An example of unknown word is the one skipped because of
+#     `min_freq=2`.
+#   * `<pad>` is the padding token. While training, a model we mostly train in batches. In a
+#     batch, there can be sentences of different length. So, we pad the shorter sentences with
+#     `<pad>` token to make length of all sequences in the batch equal.
+#
+# * At line 5, we set `special_first=True`. Which means `<pad>` will get index 0, `<sos>` index 1,
+#   `<eos>` index 2, and <unk> will get index 3 in the vocabulary.
+# * At line 7, we set default index as index of `<unk>`. That means if some word is not in
+#   vocbulary, we will use `<unk>` instead of that unknown word.
+#
+# Similarly, we will build vocabulary for target sentences:
+
+targetVocab = build_vocab_from_iterator(
+    get_tokens(dataPipe,1),
+    min_freq=2,
+    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
+    special_first=True
+)
+targetVocab.set_default_index(targetVocab['<unk>'])
+

From 10e01a819b47f9dd93a5c6bfa27a9442a230776a Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:25:04 +0530
Subject: [PATCH 07/31] added some comments

---
 beginner_source/torchtext_custom_dataset_tutorial.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 5aa6f8d8bed..004481403d9 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -199,3 +199,13 @@ def get_tokens(data_iter, place):
 )
 targetVocab.set_default_index(targetVocab['<unk>'])
 
+# %%
+# Note that the example above shows how can we add special tokens to our vocabulary. The
+# special tokens may change based on the requirements.
+#
+# Now, we can verify that special tokens are placed at the beginning and then other words.
+# In the below code, `sourceVocab.get_itos()` returns a list with tokens at index based on
+# vocabulary.
+
+print(sourceVocab.get_itos()[:9])
+

From 7dfa29f3db08949e2dd9400e3310fe342b38f29d Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:25:58 +0530
Subject: [PATCH 08/31] Numericalize sentences using vocabulary

---
 .../torchtext_custom_dataset_tutorial.py | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 004481403d9..beab7ee8641 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -209,3 +209,67 @@ def get_tokens(data_iter, place):
 print(sourceVocab.get_itos()[:9])
 
+# %%
+# Numericalize sentences using vocabulary
+# ---------------------------------------
+# After building the vocabulary, we need to convert our sentences to corresponding indices.
+# Let us define some functions for this:
+
+def get_transform(vocab):
+    """
+    Create transforms based on given vocabulary. The returned transform is applied to sequence
+    of tokens.
+    """
+    text_tranform = T.Sequential(
+        ## converts the sentences to indices based on given vocabulary
+        T.VocabTransform(vocab=vocab),
+        ## Add <sos> at beginning of each sentence. 1 because the index for <sos> in vocabulary is
+        #    1 as seen in previous section
+        T.AddToken(1, begin=True),
+        ## Add <eos> at beginning of each sentence.
2 because the index for in vocabulary is + # 2 as seen in previous section + T.AddToken(2, begin=False) + ) + return text_tranform + +# %% +# Now, let us see how to use the above function. The function returns an object of `Transforms` +# which we will use on our sentence. Let us take a random sentence and check the working of +# the transform: + +tempList = list(dataPipe) +someSetence = tempList[798][0] +print("Some sentence=", end="") +print(someSetence) +transformedSentence = get_transform(sourceVocab)(eng_tokenize(someSetence)) +print("Transformed sentence=", end="") +print(transformedSentence) +indexToString = sourceVocab.get_itos() +for index in transformedSentence: + print(indexToString[index], end=" ") + +# %% +# In the above code,: +# +# * At line 2, we take a source setence from list that we created from dataPipe at line 1 +# * At line 5, we get a transform based on a source vocabulary and apply it to a tokenized +# sentence. Note that transforms take list of words and not a sentence. +# * At line 8, we get the mapping of index to string and then use it get the transformed +# sentence +# +# Now we will use functions of `dataPipe` to apply transform to all our sentences. +# Let us define some more functions for this. + +def apply_transform(sequence_pair): + """ + Apply transforms to sequence of tokens in a sequence pair + """ + + return ( + get_transform(sourceVocab)(eng_tokenize(sequence_pair[0])), + get_transform(targetVocab)(fin_tokenize(sequence_pair[1])) + ) +dataPipe = dataPipe.map(apply_transform) ## Apply the function to each element in the iterator +tempList = list(dataPipe) +print(tempList[0]) + From d9dea745c6261c3a87d2ab3cb6e1795f68a73ac7 Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Thu, 4 May 2023 19:27:06 +0530 Subject: [PATCH 09/31] bucket batching --- .../torchtext_custom_dataset_tutorial.py | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py index beab7ee8641..1e7314d1013 100644 --- a/beginner_source/torchtext_custom_dataset_tutorial.py +++ b/beginner_source/torchtext_custom_dataset_tutorial.py @@ -273,3 +273,56 @@ def apply_transform(sequence_pair): tempList = list(dataPipe) print(tempList[0]) +# %% +# Make batches (with bucket batch) +# -------------------------------- +# Generally, we train models in batches. While working for sequence to sequence models, it is +# recommended to keep the length of sequences in a batch similar. For that we will use +# `bucketbatch` function of `dataPipe`. +# +# Let us define some functions that will be used by the `bucketbatch` function. + +def sort_bucket(bucket): + """ + Function to sort a given bucket. Here, we want to sort based on the length of + source and target sequence. + """ + return sorted(bucket, key=lambda x: (len(x[0]), len(x[1]))) + +# %% +# Now, we will apply the `bucketbatch` function: + +dataPipe = dataPipe.bucketbatch( + batch_size = 4, batch_num=5, bucket_num=1, + use_in_batch_shuffle=False, sort_key=sort_bucket +) + +# %% +# In the above code block: +# +# * We keep batch size = 4. +# * `batch_num` is the number of batches to keep in a bucket +# * `bucket_num` is the number of buckets to keep in a pool for shuffling +# * `sort_key` specifies the function that takes a bucket and sorts it +# +# Now, let us consider a batch of source sentences as `X` and a batch of target sentences as `y`. 
+# Generally, while training a model, we predict on a batch of `X` and compare the result with `y`. +# But, a batch in our `dataPipe` is of the form `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]`: + +print(list(dataPipe)[0]) +# %% +# So, we will now convert them into the form: `((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))`. +# For this we will write a small function: + +def separate_source_target(sequence_pairs): + """ + input of form: `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]` + output of form: `((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))` + """ + sources,targets = zip(*sequence_pairs) + return sources,targets + +## Apply the function to each element in the iterator +dataPipe = dataPipe.map(separate_source_target) +print(list(dataPipe)[0]) + From 2dd7f197fd9cdc5e30b6cee3be0250d8cf284da9 Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Thu, 4 May 2023 19:27:57 +0530 Subject: [PATCH 10/31] applied padding --- .../torchtext_custom_dataset_tutorial.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py index 1e7314d1013..a6d42ea999f 100644 --- a/beginner_source/torchtext_custom_dataset_tutorial.py +++ b/beginner_source/torchtext_custom_dataset_tutorial.py @@ -326,3 +326,21 @@ def separate_source_target(sequence_pairs): dataPipe = dataPipe.map(separate_source_target) print(list(dataPipe)[0]) +# %% +# Now, we have the data as desired. +# +# Padding +# ------- +# As discussed earlier while building vocabulary, we need to pad shorter sentences in a batch to +# make all the sequences in a batch of equal length. We can perform padding as follows: + +def apply_padding(pair_of_sequences): + """ + Convert sequnces to tensors and apply padding + """ + return (T.ToTensor(0)(list(pair_of_sequences[0])), T.ToTensor(0)(list(pair_of_sequences[1]))) +## `T.ToTensor(0)` returns a transform that converts the sequence to `torch.tensor` and also applies +# padding. Here, `0` is passed to the constructor to specify the index of the `` token in the +# vocabulary. +dataPipe = dataPipe.map(apply_padding) + From 1fec4b5aa0cc4b6565417420e76e6c7c02e05735 Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Thu, 4 May 2023 19:31:18 +0530 Subject: [PATCH 11/31] view the final result --- .../torchtext_custom_dataset_tutorial.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py index a6d42ea999f..9e0a3ed1c6e 100644 --- a/beginner_source/torchtext_custom_dataset_tutorial.py +++ b/beginner_source/torchtext_custom_dataset_tutorial.py @@ -344,3 +344,31 @@ def apply_padding(pair_of_sequences): # vocabulary. dataPipe = dataPipe.map(apply_padding) +# %% +# Now, we can use the index to string mapping to see how the sequence would look with tokens +# instead of indices: + +sourceItoS = sourceVocab.get_itos() +targetItoS = targetVocab.get_itos() + +def show_some_transformed_senetnces(data_pipe): + """ + Function to show how the senetnces look like after applying all transforms. 
Here we try to print actual words instead of corresponding index
+    """
+    for sources,targets in data_pipe:
+        if sources[0][-1] != 0:
+            continue # Just to visualize padding of shorter sentences
+        for i in range(4):
+            source = ""
+            for token in sources[i]:
+                source += " " + sourceItoS[token]
+            target = ""
+            for token in targets[i]:
+                target += " " + targetItoS[token]
+            print(f"Source: {source}")
+            print(f"Traget: {target}")
+        break
+# %%
+# In the above output we can observe that the shorter sentences are padded with ``. Now, we can
+# use this dataPipe while writing our training function.

From 158c5fd2ad40028dea67a48152ffbe6927729c16 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:39:16 +0530
Subject: [PATCH 12/31] added torchtext logo

---
 .../img/thumbnails/cropped/torch_text_logo.png | Bin 0 -> 8545 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 _static/img/thumbnails/cropped/torch_text_logo.png

diff --git a/_static/img/thumbnails/cropped/torch_text_logo.png b/_static/img/thumbnails/cropped/torch_text_logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..3fe736d60e282d080ac3165e3a5aad3c69bcc934
GIT binary patch
literal 8545
z)jPRFhBJ;-wPpjU1Kjd?(NK<%R_ce1rSm4>XN$da5l0%C+e7TyBo8`;;V)4P#*Lka z!SXTTe2u*y9o?ERaEhr{=B~6tg4Q#56>xC%{G9tPpQL!HV1%u!^PppFST9&}3my+j zYBZc}&+;tJh2GG5HqtTw+(=(J_30%L zXjNlYrTeqt8+d$IVm3?}$b8Jzh%&8KjnpO83u{s9=<#4prwez*VV|w|(QASJO0OjS zV>&^?FExw4;NnKJKjmKEkiDxpcps;4t&x3YaWq8S6W}WmN#3T_mS4l3kqTw7Yx`I! z^OF)&SQR6Oz_<;vmTi(wI%3tzGG!i=A<<{{@qV;^;ElMgr{ToK*dFmxa@bn#gaPv= z$zae8m#1YSV-h}nU~hXL4mD*WPxlt&hb)N4M1tZR=G2u1y5F37Id4+-PPV^43>=u! z>*H$qbWTuP(;Z4o+}n7=`U~^wgzN+BKOYG6xdgf=>*Y8e?H<{XjXbgny_;ok`8x=0 zoOVtD#h-Ac-9|dL+6IiQS3nb+U8Hiq7O47*9-g zv8&#-7Itt-VBJWv2WU8Xjwg3N96$fi{)1q`E1*Q1_?i$>zILcu=bP`OcE{M`xL+As zV3%4Rnvm{4f%ljk2HX7Rg+XjU$h4$3^nv|x*K9|Bb`cN^9SSDz`}Hy^&F<_p+9D2XLbTbCC{=?NTk z8s0KsFo84yqA6w`37`1HKN@l-n|BqZD+GO<<%ewxDSx*W$W<8FE}RLt7!*{feR|dU z+K$mjTjb5Y=i|C-T=FAKx0bzwf%WpvQ|p6Y>&_UayRF(F5EuX@5a(_G8;QU75E}8N z0`Lb>V*)#_>J+4kqw28wr}_PldqXVcpf9M%RGvL_?Xz0aBlR;mi}ezEhubbcHMEv0 z9mn=)M(VLl;k#*U*+Zgpsb0pc<4`Z~8?2&DjhKv$I!>+$s8X!G*77MRUuzi)y~gWJ zcKYVOyX=fRq_zaD=(94gEsRC1gx(3zCdbiM9t3|LAH{VbJqr7o%&#pX9-9q=Kof;( zzmVlOJ5>$}nXG;>so-@J+lKNMS-Ol5#J0%|f_s~sFty(`Caa8!QbT`c-JTwgid%G4 zFShrIc_@J?S=k^htyr6YZpvS+7F|GmOf`0j$8A3h_yyvhEzzlIe za@R8E(EQI0q3g)fFqfqO0gi*)sKYodsiae$OKfwonZs-^kG(xBkkXqLmAV)2)~e4y zwpi_b2^fshOW+3uVK{esXIO0k%2?F@^w8?ZMzzkh*(vqTqU)Y>4^{0{8+%3l^$sJ`%e@jZnekFd z0miI*s1x)Yd#XN*qUGilc|R95j6DF2*icPb3BTn$*I0vYRNC(2X|gTA`E4qnbc zliS}VtqSUm;E8tG5_61Bz< zn7nW*&w_l$;c91)R(%%Fhsg|W6@{pPb`=yYW!)0N;QgvDFGN8Ct>x!L!%{)-*acc! z6a%RpXLI#-tCt}9bvoYWQVG3@9@W&6izH3pNoBE-GhW)Rc6+zV)BQirYIc|cwt0xe z5}L7!Q0|3R854*mfV`E4sZb}pgCC^1j3M=ZiVlky!803{Fr;c1dnu^EcV}go;lq>$ zK{Se$NvsoI0PtxJP(ZZtaBe3)Z-i(5rtPiKV+BIWtdF)1ILuJM#&TCP8%LiX7ti>_ zy)3C-tk`w9U#iJrUR1Rw9TIY7uNBiS8~6iwx!XZ1|HGjbH(DKV7$R&>_;Urq_OjRb zl;P5x&d<})s#)J6<%_gk+`aE_R*Nu~R7#DQjd7+Kp08M2)5?(II?P!>E43&=(Pa*c zh>7K7CYHiQV@F0`o4ui}a>JhQFHgNvuSJ&V;3TQ=P2Aonrc2S0W)O7k+ND{aHpyw&?Wo)lBIXCL`%zq*IWA#rQU?C(Z@0B+b7->D24K!u z?XR>fN}^-;OlC0ZgpR=5Y&ew|>uv8-jX7?O%} z0C8S7f%r=JhW%@zprm{{##Hp^=E=C(P8NI9AKk>hhRj{u;62+Lv@-NBodL%HZ>sJf z9W8LWvazXILDk&oqw);`Q7!<>JFshv2Ih<#_8Z<9 zdvVA9VNex!vS*ynej>~zzR>5Z=UIt6PzoalM6mQyt$uSn$25zl0#k+_uxm^eJa_7v zrNv8ORD^lOoyrdW;KZP2F*-81HjRDvl>%XWR|_scNzUw8sj@brDJ%AuiCD#nM{ub$ zy0$4ZJ#E^xC6_lkz0wrhX7DF4O5z7Le&jPdvAgt@`Mu3Ha4C{&tGit-C0?qRr@pDp z-wAt_0i>6FeDz)wu|NIsx}o6n$ARE209`!t0Hpb?^;Je^X6}|(K?iBt+a*(;Ov#wh zHLc>`RA#xT#TbyDW-X>A(1mxy%S?TIm$&&q>z5EfX|l?QgU^8@FDaDIRO3g4GYOdh z0=`VN)UGS@X0&>%P&FwODCd@!-$Oc7ysGTgV6ei)!xGr+evEbGc<#7$?JH z!y7$A5_{x)Amn$+s~OeAzM-7O&31!W-P3Pv0~S{>>6^zg#Z!y1U2q@$s}8q+Png(+ z9RkOEW~9n@%HpyRM+G8~Z7u9uP*9My7 zhF-m@Z441MnFbbF?U(`HtTo_iYUJ2-bIuFl0VJnsg3jhK*A?9M- zcgBn%=?eWQuO<`$JPRmz#7zXqbPw=Hk(BE*fg5!q#2Bt7880#A0Mpjr+Rlvj1!x`_eOkXR<~~S7r>wUH)4pTO8~nEm%wVv@%6lvvalXFf3>k+Cbd!+SmB*>FbC7 zb>#ZE1&>jzVZ-`fRwi4OsX(APNAIpY4r_Bb|2RHar_3>*_4Br&XAT1Wi(jMHMr}zc z2kpW7kqC{_DNOzE^Ddf8edjTOQ&5 zCFw;`!>`QSl^@@kefE56^kHVRO2Zq+bm7U(!`PO;zTsLPpt!i3AqT6bLS%ZsWvvFN zm%S#)eF2>G!PO2?RRLKLQ z(d-KMy{?VHE$dmsFyA*(Stjo$5fO!E8R1h#AF|V~oxmqxMMa(FwgRuFAHQM7QzHyQ zIq%VM>H?P8zS_=PM)W!Ww_Hu1~eNH*q9wj1BM@HFB1LYG%(a>nMM5@E)oP*Cc=w<(|LbH;ALrujB68!qaoZ4_0A zkCmKkl=U_Qh31P$OUsT>O#f(W8q!^5Yx!8(cW9|SFv?C6JZKVb_e~;Kg$ERvX33PD zu?b0uDrqSF0lmj%l@sSk;FQv_w~9Nc>-F@mgm6O2LN{A2A-rKp!C-5-j$B8Vc(VDt z*&SS&3x;F!pE`T&i-#))+`a44kamuQ*4LtbIhvx#%TwfKSZYzzxCFac0i>tHU`5&} z*3#O}%G;VH(%OcjmMf&cTwNXPMIy)m+#(P+f#>?CNgmjWHpZ`3hmFO=h4<~Y!2-B3 zd8-Ij;G2(@_3Il2m>}$V7?jJNr`|!GvuHTTpTBZ$TP68|9HeSb{h6hLed{o14|$G0 z4+pVYr!dz0gFVzs2aco^r2zTcig@oL!Iadc1J~dW3DC>=*fl>W1P(R?&aNs|___#q zCqq^*q4z1DXlN6fR<8k|(XPT+8CAQhWq?^630E)X+d_I!Q+m{`z2}-@HW@h3O#!Yh 
z-{8aI&Hp`5um73cepaO@eS=8)EJJg3e*R>L=}TD&S;XCa|6IvVU}}W?E(zJ4cH)<; zG~av##<1YJrzy(^x`s`}hBYj;$DX!3gjO%kqp!~Q$8BWn5pQsF*8o&~wwLZg^u0KX zFSX|6QCq6s!icKRO1=a`R$Rk_i3>JjaZvf8OD9I3lUo-g)+x{wb{hQ#M|wyO6mPZT z@Q(6mc*i3Pl(Pf1O2pW)zu3cgVWlyRM<>3B+OnPWFoIr&f0q&q?boUgzw&xcWh9-T-**4d$mE`uE)lX;R!sm{4$?iOS zXbfF=*FzHcwa6*}m%U(H%7i#%A!1$sf%M3;%(hTuietcutlMm8?6J1< zD)gZlwS!$0+g-PB2 z=9BqN?NjQNiSP$Yn=1%e9BBO}zU#`cvP`LS-HqVB+~F|hU*d1f|9AcGSfDLOxSpKS zJLmfYX%1*S>+OW~!D6)$o4}6^h;0%Z5E9@`eD9NxL~I35eQ^6;+xR{2D0mlNg!4fc z!@d3eH86V(Fnjmy2mdO7ki5J3dz}xUe8d0qF0LQ3Y3pwA(7bre>7tj%MQ;;NsyBEb z^fCGdx|sdC`WWKA{U-YiObqq+U@#^a%-y!)<^LcM5ai|S6Z!ua=+9L<1_ZW=HF!t( z`j{L8iGz<%i1$UrfrEX$I3G~D^WRH<_l@xOcRK1D;2jv^8x)Ap*Ed$kLv8!db)5f0 p9q9H`FHlGA-`gPs`CJV5pnCr&i!>Oa--DV6>@oYJ Date: Thu, 4 May 2023 19:45:19 +0530 Subject: [PATCH 13/31] added card in index.rst --- index.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/index.rst b/index.rst index 48550e6c4ac..ecb64a1aa6e 100644 --- a/index.rst +++ b/index.rst @@ -264,6 +264,13 @@ What's new in PyTorch tutorials? :link: beginner/translation_transformer.html :tags: Text +.. customcarditem:: + :header: Preaparing custom text dataset using Torchtext + :card_description: Learn how to use torchtext to prepare a custom dataset + :image: _static/img/thumbnails/cropped/torch_text_logo.png + :link: beginner/torchtext_custom_dataset_tutorial.html + :tags: Text + .. Reinforcement Learning From 09dd73b4ae63bb4b3f77942b5453476fb5f50dc7 Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Thu, 4 May 2023 19:50:05 +0530 Subject: [PATCH 14/31] added entry in toctree --- index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/index.rst b/index.rst index ecb64a1aa6e..d23ec3a701a 100644 --- a/index.rst +++ b/index.rst @@ -877,6 +877,7 @@ Additional Resources intermediate/seq2seq_translation_tutorial beginner/text_sentiment_ngrams_tutorial beginner/translation_transformer + beginner/torchtext_custom_dataset_tutorial .. 
toctree:: From f7619d73a870b7a25c2a63230a0477eb1978eff1 Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Thu, 4 May 2023 20:06:01 +0530 Subject: [PATCH 15/31] updated Makefile for downloading dataset --- Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile b/Makefile index a01ea69bb50..fe5cd32e31e 100644 --- a/Makefile +++ b/Makefile @@ -106,6 +106,10 @@ download: wget -nv -N http://dl.fbaipublicfiles.com/pythia/data/vocab.tar.gz -P $(DATADIR) tar $(TAROPTS) -xzf $(DATADIR)/vocab.tar.gz -C ./beginner_source/data/ + # Download dataset for beginner_source/torchtext_custom_dataset_tutorial.py + wget -nv -N https://www.manythings.org/anki/fin-eng.zip -P $(DATADIR) + unzip -o $(DATADIR)/fin-eng.zip -d beginner_source/data/ + docs: make download From 1e4163d91a906483212162d6fdc8e46c8da2d637 Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Thu, 4 May 2023 20:07:27 +0530 Subject: [PATCH 16/31] get dataset from data folder --- beginner_source/torchtext_custom_dataset_tutorial.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py index 9e0a3ed1c6e..9281998054e 100644 --- a/beginner_source/torchtext_custom_dataset_tutorial.py +++ b/beginner_source/torchtext_custom_dataset_tutorial.py @@ -64,7 +64,7 @@ # %% # Now we will load the dataset -FILE_PATH = 'fin.txt' +FILE_PATH = 'data/fin.txt' dataPipe = dp.iter.IterableWrapper([FILE_PATH]) dataPipe = dp.iter.FileOpener(dataPipe, mode='rb') dataPipe = dataPipe.parse_csv(skip_lines=0, delimiter='\t', as_tuple=True) From 1d96ae31d27a068a4720ed2abc1bbc3613c46c3a Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Thu, 4 May 2023 22:18:09 +0530 Subject: [PATCH 17/31] updated comment --- beginner_source/torchtext_custom_dataset_tutorial.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py index 9281998054e..b9ae85fde93 100644 --- a/beginner_source/torchtext_custom_dataset_tutorial.py +++ b/beginner_source/torchtext_custom_dataset_tutorial.py @@ -9,8 +9,7 @@ will be preparing a custom dataset that can be further utilized to train a sequence-to-sequence model for machine translation (something like, in this tutorial: `Sequence to Sequence Learning with Neural Networks `_) but using Torchtext 0.15.0 instead -of a legacy version. +20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb>`_) but without using legacy version. 
In this tutorial, we will learn how to: From af728f1893cb7d2bf0ba426b7a9f62922d9d5ced Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Thu, 4 May 2023 22:57:11 +0530 Subject: [PATCH 18/31] updated function to view results, and added some sources --- beginner_source/torchtext_custom_dataset_tutorial.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py index b9ae85fde93..6a4b6991e3b 100644 --- a/beginner_source/torchtext_custom_dataset_tutorial.py +++ b/beginner_source/torchtext_custom_dataset_tutorial.py @@ -350,7 +350,7 @@ def apply_padding(pair_of_sequences): sourceItoS = sourceVocab.get_itos() targetItoS = targetVocab.get_itos() -def show_some_transformed_senetnces(data_pipe): +def show_some_transformed_sentences(data_pipe): """ Function to show how the senetnces look like after applying all transforms. Here we try to print actual words instead of corresponding index @@ -368,6 +368,11 @@ def show_some_transformed_senetnces(data_pipe): print(f"Source: {source}") print(f"Traget: {target}") break + +show_some_transformed_sentences(dataPipe) # %% -# In the above output we can observe that the shorter sentences are padded with ``. Now, we can -# use this dataPipe while writing our training function. +# In the above output we can observe that the shorter sentences are padded with ``. Now, we\ +# can use this dataPipe while writing our training function. +# +# Some parts of this tutorial was inspired from this article: `Click here: `__. From 519ccbfe51723b2e64c57a123bae265154c66571 Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Thu, 4 May 2023 23:09:42 +0530 Subject: [PATCH 19/31] updated typo --- beginner_source/torchtext_custom_dataset_tutorial.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py index 6a4b6991e3b..aeb1ef86de4 100644 --- a/beginner_source/torchtext_custom_dataset_tutorial.py +++ b/beginner_source/torchtext_custom_dataset_tutorial.py @@ -374,5 +374,5 @@ def show_some_transformed_sentences(data_pipe): # In the above output we can observe that the shorter sentences are padded with ``. Now, we\ # can use this dataPipe while writing our training function. # -# Some parts of this tutorial was inspired from this article: `Click here: `__. From 332356be56c643a7ef1821f10e3fa5ef9e09826e Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Fri, 5 May 2023 01:11:53 +0530 Subject: [PATCH 20/31] fixed hyperlinks --- .../torchtext_custom_dataset_tutorial.py | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py index aeb1ef86de4..d22a41eb303 100644 --- a/beginner_source/torchtext_custom_dataset_tutorial.py +++ b/beginner_source/torchtext_custom_dataset_tutorial.py @@ -9,7 +9,8 @@ will be preparing a custom dataset that can be further utilized to train a sequence-to-sequence model for machine translation (something like, in this tutorial: `Sequence to Sequence Learning with Neural Networks `_) but without using legacy version. +20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb>`_) but without using legacy version +of torchtext. 
In this tutorial, we will learn how to: @@ -21,16 +22,18 @@ Let us assume that we need to prepare a dataset to train a model that can perform English to Finnish translation. We will use a tab-delimited Finnish - English sentence pairs provided by the `Tatoeba Project `_ which can be downloaded from this link: `Click -Here `__ +Here `__. + +Sentence pairs for other languages can be found in this link: + +Link: `https://www.manythings.org/anki/ `__ """ # %% -# Setup -# ----- +#Setup +#----- # # First, download the dataset, extract the zip, and note the path to the file `fin.txt`. -# The dataset can be downloaded from this link: `Click Here `__ . # # Ensure that following packages are installed: # @@ -79,8 +82,9 @@ # of the tab-delimited file # # Data pipes can be thought of something like a dataset object, on which -# we can perform various operations. Check `this tutorial `_ for more details on data pipes. +# we can perform various operations. +# Check `this tutorial `_ for more details on +# data pipes. # # We can verify if the iterable has the pair of sentences as shown # below: @@ -371,8 +375,9 @@ def show_some_transformed_sentences(data_pipe): show_some_transformed_sentences(dataPipe) # %% -# In the above output we can observe that the shorter sentences are padded with ``. Now, we\ +# In the above output we can observe that the shorter sentences are padded with ``. Now, we # can use this dataPipe while writing our training function. # -# Some parts of this tutorial was inspired from this article: `Click here `__. +# Some parts of this tutorial was inspired from this article: +# `Link https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71 \ +# `__. From 44848ff67d565f4ab5869ea30ea0e656b257e05f Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Fri, 5 May 2023 01:22:00 +0530 Subject: [PATCH 21/31] changed title and introduction --- beginner_source/torchtext_custom_dataset_tutorial.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py index d22a41eb303..f4e5350cbd9 100644 --- a/beginner_source/torchtext_custom_dataset_tutorial.py +++ b/beginner_source/torchtext_custom_dataset_tutorial.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- """ -Preaparing custom text dataset using Torchtext +Pre-process custom text dataset using Torchtext ============================================== **Author**: `Anupam Sharma `_ -This tutorial is regarding the preparation of a text dataset using Torchtext. In the tutorial, we -will be preparing a custom dataset that can be further utilized to train a sequence-to-sequence +This tutorial illustrates the usage of torchtext on a dataset that is not built-in. 
In the tutorial, +we will pre-process a dataset that can be further utilized to train a sequence-to-sequence model for machine translation (something like, in this tutorial: `Sequence to Sequence Learning with Neural Networks `_) but without using legacy version From e007c60402c0e91dd1014b2bdb7e30bd027d493f Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Fri, 5 May 2023 01:46:14 +0530 Subject: [PATCH 22/31] fixed indentation issue --- beginner_source/torchtext_custom_dataset_tutorial.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py index f4e5350cbd9..e66333d8a15 100644 --- a/beginner_source/torchtext_custom_dataset_tutorial.py +++ b/beginner_source/torchtext_custom_dataset_tutorial.py @@ -30,8 +30,8 @@ """ # %% -#Setup -#----- +# Setup +# ----- # # First, download the dataset, extract the zip, and note the path to the file `fin.txt`. # From 66dd97295781d5f911430f886258147cda3754f4 Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Fri, 5 May 2023 02:11:28 +0530 Subject: [PATCH 23/31] fixed typo --- .../torchtext_custom_dataset_tutorial.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py index e66333d8a15..20bea700745 100644 --- a/beginner_source/torchtext_custom_dataset_tutorial.py +++ b/beginner_source/torchtext_custom_dataset_tutorial.py @@ -241,10 +241,10 @@ def get_transform(vocab): # the transform: tempList = list(dataPipe) -someSetence = tempList[798][0] +someSentence = tempList[798][0] print("Some sentence=", end="") -print(someSetence) -transformedSentence = get_transform(sourceVocab)(eng_tokenize(someSetence)) +print(someSentence) +transformedSentence = get_transform(sourceVocab)(eng_tokenize(someSentence)) print("Transformed sentence=", end="") print(transformedSentence) indexToString = sourceVocab.get_itos() @@ -254,7 +254,7 @@ def get_transform(vocab): # %% # In the above code,: # -# * At line 2, we take a source setence from list that we created from dataPipe at line 1 +# * At line 2, we take a source sentence from list that we created from dataPipe at line 1 # * At line 5, we get a transform based on a source vocabulary and apply it to a tokenized # sentence. Note that transforms take list of words and not a sentence. # * At line 8, we get the mapping of index to string and then use it get the transformed @@ -378,6 +378,6 @@ def show_some_transformed_sentences(data_pipe): # In the above output we can observe that the shorter sentences are padded with ``. Now, we # can use this dataPipe while writing our training function. # -# Some parts of this tutorial was inspired from this article: -# `Link https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71 \ -# `__. +# Some parts of this tutorial was inspired from this article: Torchtext DataLoaders in version 0.14.0 +# `Link: https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71 \ +# `__. 
From 190e4a1fadda8ff3d22922c8cc1bdf11f4642207 Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Fri, 5 May 2023 02:27:45 +0530 Subject: [PATCH 24/31] fixed typo --- beginner_source/torchtext_custom_dataset_tutorial.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py index 20bea700745..d6e6cdcf3d5 100644 --- a/beginner_source/torchtext_custom_dataset_tutorial.py +++ b/beginner_source/torchtext_custom_dataset_tutorial.py @@ -378,6 +378,6 @@ def show_some_transformed_sentences(data_pipe): # In the above output we can observe that the shorter sentences are padded with ``. Now, we # can use this dataPipe while writing our training function. # -# Some parts of this tutorial was inspired from this article: Torchtext DataLoaders in version 0.14.0 -# `Link: https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71 \ -# `__. +# Some parts of this tutorial was inspired from this article: +# Link: `https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71\ +# `__. From ba5efff2f15c235633f39ea86d06561944dcec1b Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Fri, 5 May 2023 03:26:57 +0530 Subject: [PATCH 25/31] replaced Finninsh with German as spacy German model is already there in build --- .../torchtext_custom_dataset_tutorial.py | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py index d6e6cdcf3d5..ad749d9fefb 100644 --- a/beginner_source/torchtext_custom_dataset_tutorial.py +++ b/beginner_source/torchtext_custom_dataset_tutorial.py @@ -20,9 +20,9 @@ * Perform bucket batching Let us assume that we need to prepare a dataset to train a model that can perform English to -Finnish translation. We will use a tab-delimited Finnish - English sentence pairs provided by +German translation. We will use a tab-delimited German - English sentence pairs provided by the `Tatoeba Project `_ which can be downloaded from this link: `Click -Here `__. +Here `__. Sentence pairs for other languages can be found in this link: @@ -33,7 +33,7 @@ # Setup # ----- # -# First, download the dataset, extract the zip, and note the path to the file `fin.txt`. +# First, download the dataset, extract the zip, and note the path to the file `deu.txt`. # # Ensure that following packages are installed: # @@ -47,10 +47,10 @@ # convert a sentence to list of words. Spacy is a python package used for various Natural # Language Processing (NLP) tasks. 
# -# Download the English and Finnish models from spacy as shown below: :: +# Download the English and German models from spacy as shown below: :: # # python -m spacy download en_core_web_sm -# python -m spacy download fi_core_news_sm +# python -m spacy download de_core_news_sm # %% @@ -61,12 +61,12 @@ import spacy from torchtext.vocab import build_vocab_from_iterator eng = spacy.load("en_core_web_sm") # Load the English model to be used for tokenizing -fin = spacy.load("fi_core_news_sm") # Load the Finnish model to be used for tokenizing +de = spacy.load("de_core_news_sm") # Load the German model to be used for tokenizing # %% # Now we will load the dataset -FILE_PATH = 'data/fin.txt' +FILE_PATH = 'data/deu.txt' dataPipe = dp.iter.IterableWrapper([FILE_PATH]) dataPipe = dp.iter.FileOpener(dataPipe, mode='rb') dataPipe = dataPipe.parse_csv(skip_lines=0, delimiter='\t', as_tuple=True) @@ -123,23 +123,23 @@ def eng_tokenize(text): """ return [token.text for token in eng.tokenizer(text)] -def fin_tokenize(text): +def de_tokenize(text): """ - Tokenize a Finnish text and returns list of tokens + Tokenize a German text and returns list of tokens """ - return [token.text for token in fin.tokenizer(text)] + return [token.text for token in de.tokenizer(text)] # %% # Above function accepts a text and returns a list of words # as shown below: print(eng_tokenize("Have a good day!!!")) -print(fin_tokenize("Hyvää päivänjatkoa!!!")) +print(de_tokenize("Haben Sie einen guten Tag!!!")) # %% # Building the vocabulary # ----------------------- -# Let us consider an English sentence as the source and a Finnish sentence as the target. +# Let us consider an English sentence as the source and a German sentence as the target. # # Vocabulary can be considered as the set of unique words we have in the dataset. # We will build vocabulary for both our source and target now. @@ -153,11 +153,11 @@ def get_tokens(data_iter, place): tuple of sentences (source and target), `place` parameters defines for which index to return the tokens for. 
`place=0` for source and `place=1` for target """ - for english, finnish in data_iter: + for english, german in data_iter: if place == 0: yield eng_tokenize(english) else: - yield fin_tokenize(finnish) + yield de_tokenize(german) # %% # Now, we will build vocabulary for source: @@ -270,7 +270,7 @@ def apply_transform(sequence_pair): return ( get_transform(sourceVocab)(eng_tokenize(sequence_pair[0])), - get_transform(targetVocab)(fin_tokenize(sequence_pair[1])) + get_transform(targetVocab)(de_tokenize(sequence_pair[1])) ) dataPipe = dataPipe.map(apply_transform) ## Apply the function to each element in the iterator tempList = list(dataPipe) From 8b105b6f905361ff7024ed1462e060e3571311f1 Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Fri, 5 May 2023 03:30:18 +0530 Subject: [PATCH 26/31] fixed issue in title --- beginner_source/torchtext_custom_dataset_tutorial.py | 2 +- index.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py index ad749d9fefb..e4ec3b48100 100644 --- a/beginner_source/torchtext_custom_dataset_tutorial.py +++ b/beginner_source/torchtext_custom_dataset_tutorial.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ Pre-process custom text dataset using Torchtext -============================================== +=============================================== **Author**: `Anupam Sharma `_ diff --git a/index.rst b/index.rst index d23ec3a701a..6e6d687d0c1 100644 --- a/index.rst +++ b/index.rst @@ -265,7 +265,7 @@ What's new in PyTorch tutorials? :tags: Text .. customcarditem:: - :header: Preaparing custom text dataset using Torchtext + :header: Pre-process custom text dataset using Torchtext :card_description: Learn how to use torchtext to prepare a custom dataset :image: _static/img/thumbnails/cropped/torch_text_logo.png :link: beginner/torchtext_custom_dataset_tutorial.html From 409b29d046dc904d2e5f1905e8f071276e8250c2 Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Fri, 5 May 2023 03:31:38 +0530 Subject: [PATCH 27/31] use another dataset --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index fe5cd32e31e..ed0ade00465 100644 --- a/Makefile +++ b/Makefile @@ -107,8 +107,8 @@ download: tar $(TAROPTS) -xzf $(DATADIR)/vocab.tar.gz -C ./beginner_source/data/ # Download dataset for beginner_source/torchtext_custom_dataset_tutorial.py - wget -nv -N https://www.manythings.org/anki/fin-eng.zip -P $(DATADIR) - unzip -o $(DATADIR)/fin-eng.zip -d beginner_source/data/ + wget -nv -N https://www.manythings.org/anki/deu-eng.zip -P $(DATADIR) + unzip -o $(DATADIR)/deu-eng.zip -d beginner_source/data/ docs: From 8925bcba4e9c9282ecf7eae320e519f8a4b8f32f Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Tue, 16 May 2023 18:43:05 +0530 Subject: [PATCH 28/31] addressed review comments for PR #2307 --- .../torchtext_custom_dataset_tutorial.py | 164 +++++++++--------- 1 file changed, 81 insertions(+), 83 deletions(-) diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py index e4ec3b48100..1439615050b 100644 --- a/beginner_source/torchtext_custom_dataset_tutorial.py +++ b/beginner_source/torchtext_custom_dataset_tutorial.py @@ -21,12 +21,11 @@ Let us assume that we need to prepare a dataset to train a model that can perform English to German 
translation. We will use a tab-delimited German - English sentence pairs provided by -the `Tatoeba Project `_ which can be downloaded from this link: `Click -Here `__. +the `Tatoeba Project `_ which can be downloaded from +`this link `__. -Sentence pairs for other languages can be found in this link: - -Link: `https://www.manythings.org/anki/ `__ +Sentence pairs for other languages can be found in `this link `\ +__. """ # %% @@ -37,11 +36,11 @@ # # Ensure that following packages are installed: # -# * `Torchdata 0.6.0 `_ (Installation instructions: `C\ -# lick here `__) -# * `Torchtext 0.15.0 `_ (Installation instructions:\ -# `Click here `__) -# * Spacy (Docs: `Click here `__) +# * `Torchdata 0.6.0 `_ (`Installation instructions \ +# `__) +# * `Torchtext 0.15.0 `_ (`Installation instructions \ +# `__) +# * `Spacy `__ # # Here, we are using `Spacy` to tokenize text. In simple words tokenization means to # convert a sentence to list of words. Spacy is a python package used for various Natural @@ -67,9 +66,9 @@ # Now we will load the dataset FILE_PATH = 'data/deu.txt' -dataPipe = dp.iter.IterableWrapper([FILE_PATH]) -dataPipe = dp.iter.FileOpener(dataPipe, mode='rb') -dataPipe = dataPipe.parse_csv(skip_lines=0, delimiter='\t', as_tuple=True) +data_pipe = dp.iter.IterableWrapper([FILE_PATH]) +data_pipe = dp.iter.FileOpener(data_pipe, mode='rb') +data_pipe = data_pipe.parse_csv(skip_lines=0, delimiter='\t', as_tuple=True) # %% # In the above code block, we are doing following things: @@ -81,15 +80,15 @@ # again returns an iterable of tuples representing each rows # of the tab-delimited file # -# Data pipes can be thought of something like a dataset object, on which +# DataPipes can be thought of something like a dataset object, on which # we can perform various operations. # Check `this tutorial `_ for more details on -# data pipes. +# DataPipes. # # We can verify if the iterable has the pair of sentences as shown # below: -for sample in dataPipe: +for sample in data_pipe: print(sample) break @@ -97,35 +96,35 @@ # Note that we also have attribution details along with pair of sentences. We will # write a small function to remove the attribution details: -def remove_attribution(row): +def removeAttribution(row): """ Function to keep the first two elements in a tuple """ return row[:2] -dataPipe = dataPipe.map(remove_attribution) +data_pipe = data_pipe.map(removeAttribution) # %% -# The `map` function at line 2 in above code block can be used to apply some function -# on each elements of data pipe. Now, we can verify that the data pipe only contains +# The `map` function at line 6 in above code block can be used to apply some function +# on each elements of `data_pipe`. Now, we can verify that the `data_pipe` only contains # pair of sentences. 
-for sample in dataPipe: +for sample in data_pipe: print(sample) break # %% # Now, let us define few functions to perform tokenization: -def eng_tokenize(text): +def engTokenize(text): """ - Tokenize an English text and returns list of tokens + Tokenize an English text and return a list of tokens """ return [token.text for token in eng.tokenizer(text)] -def de_tokenize(text): +def deTokenize(text): """ - Tokenize a German text and returns list of tokens + Tokenize a German text and return a list of tokens """ return [token.text for token in de.tokenizer(text)] @@ -133,8 +132,8 @@ def de_tokenize(text): # Above function accepts a text and returns a list of words # as shown below: -print(eng_tokenize("Have a good day!!!")) -print(de_tokenize("Haben Sie einen guten Tag!!!")) +print(engTokenize("Have a good day!!!")) +print(deTokenize("Haben Sie einen guten Tag!!!")) # %% # Building the vocabulary @@ -145,9 +144,9 @@ def de_tokenize(text): # We will build vocabulary for both our source and target now. # # Let us define a function to get tokens from elements of tuples in the iterator. -# The comments within the function specifies the need and working of it: -def get_tokens(data_iter, place): + +def getTokens(data_iter, place): """ Function to yield tokens from an iterator. Since, our iterator contains tuple of sentences (source and target), `place` parameters defines for which @@ -155,25 +154,25 @@ def get_tokens(data_iter, place): """ for english, german in data_iter: if place == 0: - yield eng_tokenize(english) + yield engTokenize(english) else: - yield de_tokenize(german) + yield deTokenize(german) # %% # Now, we will build vocabulary for source: -sourceVocab = build_vocab_from_iterator( - get_tokens(dataPipe,0), +source_vocab = build_vocab_from_iterator( + getTokens(data_pipe,0), min_freq=2, specials= ['', '', '', ''], special_first=True ) -sourceVocab.set_default_index(sourceVocab['']) +source_vocab.set_default_index(source_vocab['']) # %% # The code above, builds the vocabulary from the iterator. In the above code block: # -# * At line 2, we call the `get_tokens()` function with `place=0` as we need vocabulary for +# * At line 2, we call the `getTokens()` function with `place=0` as we need vocabulary for # source sentences. # * At line 3, we set `min_freq=2`. This means, the function will skip those words that occurs # less than 2 times. @@ -194,23 +193,23 @@ def get_tokens(data_iter, place): # # Similarly, we will build vocabulary for target sentences: -targetVocab = build_vocab_from_iterator( - get_tokens(dataPipe,1), +target_vocab = build_vocab_from_iterator( + getTokens(data_pipe,1), min_freq=2, specials= ['', '', '', ''], special_first=True ) -targetVocab.set_default_index(targetVocab['']) +target_vocab.set_default_index(target_vocab['']) # %% # Note that the example above shows how can we add special tokens to our vocabulary. The # special tokens may change based on the requirements. # # Now, we can verify that special tokens are placed at the beginning and then other words. -# In the below code, `sourceVocab.get_itos()` returns a list with tokens at index based on +# In the below code, `source_vocab.get_itos()` returns a list with tokens at index based on # vocabulary. -print(sourceVocab.get_itos()[:9]) +print(source_vocab.get_itos()[:9]) # %% # Numericalize sentences using vocabulary @@ -218,7 +217,7 @@ def get_tokens(data_iter, place): # After building the vocabulary, we need to convert our sentences to corresponding indices. 
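#
# Conceptually, numericalization is just a lookup into the vocabularies built above. As a rough
# illustration (a hypothetical check, reusing `source_vocab` and `engTokenize` from the previous
# sections), the indices for one tokenized sentence can be obtained directly: ::
#
#    print(source_vocab.lookup_indices(engTokenize("Have a good day!!!")))
#
# Out-of-vocabulary words map to the index of the unknown token because of the default index set
# earlier. The transform defined next wraps this lookup together with the start-of-sentence and
# end-of-sentence tokens.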
# Let us define some functions for this: -def get_transform(vocab): +def getTransform(vocab): """ Create transforms based on given vocabulary. The returned transform is applied to sequence of tokens. @@ -237,55 +236,55 @@ def get_transform(vocab): # %% # Now, let us see how to use the above function. The function returns an object of `Transforms` -# which we will use on our sentence. Let us take a random sentence and check the working of -# the transform: +# which we will use on our sentence. Let us take a random sentence and check how the transform +# works. -tempList = list(dataPipe) -someSentence = tempList[798][0] +temp_list = list(data_pipe) +some_sentence = temp_list[798][0] print("Some sentence=", end="") -print(someSentence) -transformedSentence = get_transform(sourceVocab)(eng_tokenize(someSentence)) +print(some_sentence) +transformed_sentence = getTransform(source_vocab)(engTokenize(some_sentence)) print("Transformed sentence=", end="") -print(transformedSentence) -indexToString = sourceVocab.get_itos() -for index in transformedSentence: - print(indexToString[index], end=" ") +print(transformed_sentence) +index_to_string = source_vocab.get_itos() +for index in transformed_sentence: + print(index_to_string[index], end=" ") # %% # In the above code,: # -# * At line 2, we take a source sentence from list that we created from dataPipe at line 1 -# * At line 5, we get a transform based on a source vocabulary and apply it to a tokenized -# sentence. Note that transforms take list of words and not a sentence. -# * At line 8, we get the mapping of index to string and then use it get the transformed -# sentence +# * At line 2, we take a source sentence from list that we created from `data_pipe` at line 1 +# * At line 5, we get a transform based on a source vocabulary and apply it to a tokenized +# sentence. Note that transforms take list of words and not a sentence. +# * At line 8, we get the mapping of index to string and then use it get the transformed +# sentence # -# Now we will use functions of `dataPipe` to apply transform to all our sentences. +# Now we will use DataPipe functions to apply transform to all our sentences. # Let us define some more functions for this. -def apply_transform(sequence_pair): +def applyTransform(sequence_pair): """ Apply transforms to sequence of tokens in a sequence pair """ return ( - get_transform(sourceVocab)(eng_tokenize(sequence_pair[0])), - get_transform(targetVocab)(de_tokenize(sequence_pair[1])) + getTransform(source_vocab)(engTokenize(sequence_pair[0])), + getTransform(target_vocab)(deTokenize(sequence_pair[1])) ) -dataPipe = dataPipe.map(apply_transform) ## Apply the function to each element in the iterator -tempList = list(dataPipe) -print(tempList[0]) +data_pipe = data_pipe.map(applyTransform) ## Apply the function to each element in the iterator +temp_list = list(data_pipe) +print(temp_list[0]) # %% # Make batches (with bucket batch) # -------------------------------- # Generally, we train models in batches. While working for sequence to sequence models, it is # recommended to keep the length of sequences in a batch similar. For that we will use -# `bucketbatch` function of `dataPipe`. +# `bucketbatch` function of `data_pipe`. # # Let us define some functions that will be used by the `bucketbatch` function. -def sort_bucket(bucket): +def sortBucket(bucket): """ Function to sort a given bucket. Here, we want to sort based on the length of source and target sequence. 
@@ -295,9 +294,9 @@ def sort_bucket(bucket): # %% # Now, we will apply the `bucketbatch` function: -dataPipe = dataPipe.bucketbatch( +data_pipe = data_pipe.bucketbatch( batch_size = 4, batch_num=5, bucket_num=1, - use_in_batch_shuffle=False, sort_key=sort_bucket + use_in_batch_shuffle=False, sort_key=sortBucket ) # %% @@ -310,14 +309,14 @@ def sort_bucket(bucket): # # Now, let us consider a batch of source sentences as `X` and a batch of target sentences as `y`. # Generally, while training a model, we predict on a batch of `X` and compare the result with `y`. -# But, a batch in our `dataPipe` is of the form `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]`: +# But, a batch in our `data_pipe` is of the form `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]`: -print(list(dataPipe)[0]) +print(list(data_pipe)[0]) # %% # So, we will now convert them into the form: `((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))`. # For this we will write a small function: -def separate_source_target(sequence_pairs): +def separateSourceTarget(sequence_pairs): """ input of form: `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]` output of form: `((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))` @@ -326,8 +325,8 @@ def separate_source_target(sequence_pairs): return sources,targets ## Apply the function to each element in the iterator -dataPipe = dataPipe.map(separate_source_target) -print(list(dataPipe)[0]) +data_pipe = data_pipe.map(separateSourceTarget) +print(list(data_pipe)[0]) # %% # Now, we have the data as desired. @@ -337,7 +336,7 @@ def separate_source_target(sequence_pairs): # As discussed earlier while building vocabulary, we need to pad shorter sentences in a batch to # make all the sequences in a batch of equal length. We can perform padding as follows: -def apply_padding(pair_of_sequences): +def applyPadding(pair_of_sequences): """ Convert sequnces to tensors and apply padding """ @@ -345,16 +344,16 @@ def apply_padding(pair_of_sequences): ## `T.ToTensor(0)` returns a transform that converts the sequence to `torch.tensor` and also applies # padding. Here, `0` is passed to the constructor to specify the index of the `` token in the # vocabulary. -dataPipe = dataPipe.map(apply_padding) +data_pipe = data_pipe.map(applyPadding) # %% # Now, we can use the index to string mapping to see how the sequence would look with tokens # instead of indices: -sourceItoS = sourceVocab.get_itos() -targetItoS = targetVocab.get_itos() +source_index_to_string = source_vocab.get_itos() +target_index_to_string = target_vocab.get_itos() -def show_some_transformed_sentences(data_pipe): +def showSomeTransformedSentences(data_pipe): """ Function to show how the senetnces look like after applying all transforms. Here we try to print actual words instead of corresponding index @@ -365,19 +364,18 @@ def show_some_transformed_sentences(data_pipe): for i in range(4): source = "" for token in sources[i]: - source += " " + sourceItoS[token] + source += " " + source_index_to_string[token] target = "" for token in targets[i]: - target += " " + targetItoS[token] + target += " " + target_index_to_string[token] print(f"Source: {source}") print(f"Traget: {target}") break -show_some_transformed_sentences(dataPipe) +showSomeTransformedSentences(data_pipe) # %% # In the above output we can observe that the shorter sentences are padded with ``. Now, we -# can use this dataPipe while writing our training function. +# can use `data_pipe` while writing our training function. 
# -# Some parts of this tutorial was inspired from this article: -# Link: `https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71\ +# Some parts of this tutorial was inspired from `this article # `__. From 9f71e3fa81569c001c2284600e8295c2cfdd9848 Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Tue, 16 May 2023 22:45:13 +0530 Subject: [PATCH 29/31] corrected spelling mistakes --- .../torchtext_custom_dataset_tutorial.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py index 1439615050b..05d3c0b8625 100644 --- a/beginner_source/torchtext_custom_dataset_tutorial.py +++ b/beginner_source/torchtext_custom_dataset_tutorial.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- """ -Pre-process custom text dataset using Torchtext +Preprocess custom text dataset using Torchtext =============================================== **Author**: `Anupam Sharma `_ This tutorial illustrates the usage of torchtext on a dataset that is not built-in. In the tutorial, -we will pre-process a dataset that can be further utilized to train a sequence-to-sequence +we will preprocess a dataset that can be further utilized to train a sequence-to-sequence model for machine translation (something like, in this tutorial: `Sequence to Sequence Learning with Neural Networks `_) but without using legacy version @@ -46,7 +46,7 @@ # convert a sentence to list of words. Spacy is a python package used for various Natural # Language Processing (NLP) tasks. # -# Download the English and German models from spacy as shown below: :: +# Download the English and German models from Spacy as shown below: :: # # python -m spacy download en_core_web_sm # python -m spacy download de_core_news_sm @@ -179,7 +179,7 @@ def getTokens(data_iter, place): # * At line 4, we specify some special tokens: # # * `` for start of sentence -# * `` for end of senetence +# * `` for end of sentence # * `` for unknown words. An example of unknown word is the one skipped because of # `min_freq=2`. # * `` is the padding token. While training, a model we mostly train in batches. In a @@ -189,7 +189,7 @@ def getTokens(data_iter, place): # * At line 5, we set `special_first=True`. Which means `` will get index 0, `` index 1, # `` index 2, and will get index 3 in the vocabulary. # * At line 7, we set default index as index of ``. That means if some word is not in -# vocbulary, we will use `` instead of that unknown word. +# vocabulary, we will use `` instead of that unknown word. # # Similarly, we will build vocabulary for target sentences: @@ -338,7 +338,7 @@ def separateSourceTarget(sequence_pairs): def applyPadding(pair_of_sequences): """ - Convert sequnces to tensors and apply padding + Convert sequences to tensors and apply padding """ return (T.ToTensor(0)(list(pair_of_sequences[0])), T.ToTensor(0)(list(pair_of_sequences[1]))) ## `T.ToTensor(0)` returns a transform that converts the sequence to `torch.tensor` and also applies @@ -355,7 +355,7 @@ def applyPadding(pair_of_sequences): def showSomeTransformedSentences(data_pipe): """ - Function to show how the senetnces look like after applying all transforms. + Function to show how the sentences look like after applying all transforms. 
Here we try to print actual words instead of corresponding index """ for sources,targets in data_pipe: From 12b644072d9f720123ae27639f8eb9adc3cbbee5 Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Wed, 17 May 2023 00:57:40 +0530 Subject: [PATCH 30/31] followed pyspelling's configuration for the shell commands --- .../torchtext_custom_dataset_tutorial.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py index 05d3c0b8625..9875d8aa43a 100644 --- a/beginner_source/torchtext_custom_dataset_tutorial.py +++ b/beginner_source/torchtext_custom_dataset_tutorial.py @@ -46,10 +46,13 @@ # convert a sentence to list of words. Spacy is a python package used for various Natural # Language Processing (NLP) tasks. # -# Download the English and German models from Spacy as shown below: :: +# Download the English and German models from Spacy as shown below: +# +# .. code-block:: shell +# +# python -m spacy download en_core_web_sm +# python -m spacy download de_core_news_sm # -# python -m spacy download en_core_web_sm -# python -m spacy download de_core_news_sm # %% @@ -59,8 +62,8 @@ import torchtext.transforms as T import spacy from torchtext.vocab import build_vocab_from_iterator -eng = spacy.load("en_core_web_sm") # Load the English model to be used for tokenizing -de = spacy.load("de_core_news_sm") # Load the German model to be used for tokenizing +eng = spacy.load("en_core_web_sm") # Load the English model to tokenize English text +de = spacy.load("de_core_news_sm") # Load the German model to tokenize German text # %% # Now we will load the dataset From f48bc259ccc54adc7413ec985066606a62cb66e5 Mon Sep 17 00:00:00 2001 From: anp-scp <29808870+anp-scp@users.noreply.github.com> Date: Wed, 17 May 2023 00:59:24 +0530 Subject: [PATCH 31/31] added words used in beginner_source/torchtext_custom_dataset_tutorial.py --- en-wordlist.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/en-wordlist.txt b/en-wordlist.txt index fdf5df67d8d..20b63dadc2d 100644 --- a/en-wordlist.txt +++ b/en-wordlist.txt @@ -30,6 +30,8 @@ DDP DDQN DNN DQN +DataPipe +DataPipes DataLoaders DeepMind DeiT @@ -126,6 +128,7 @@ SciPy Sequentials Sigmoid SoTA +Spacy TPU TensorBoard TextVQA @@ -345,6 +348,7 @@ timestep timesteps tokenization tokenize +tokenized tokenizer tokenizes tooltip
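
The tutorial content in the patches above builds a vocabulary with special tokens and then converts tokenized sentences into index sequences. The short, self-contained sketch below condenses that numericalization step so it can be run on its own. It assumes the four special tokens are named `<sos>`, `<eos>`, `<unk>` and `<pad>`, with the padding token at index 0, and it uses a tiny toy corpus in place of the real sentence pairs, so the printed indices are only illustrative::

    import torchtext.transforms as T
    from torchtext.vocab import build_vocab_from_iterator

    # Toy corpus of already-tokenized sentences; the tutorial tokenizes the
    # dataset with Spacy before this step.
    toy_corpus = [
        ["have", "a", "good", "day"],
        ["have", "a", "nice", "day"],
    ]

    # Build a vocabulary. `min_freq=1` here so the toy words survive; the
    # tutorial uses `min_freq=2`, which maps rare words to `<unk>`.
    vocab = build_vocab_from_iterator(
        toy_corpus,
        min_freq=1,
        specials=["<pad>", "<sos>", "<eos>", "<unk>"],
        special_first=True,
    )
    vocab.set_default_index(vocab["<unk>"])

    # Compose a transform: map tokens to indices, then add the <sos>/<eos> indices.
    transform = T.Sequential(
        T.VocabTransform(vocab),
        T.AddToken(vocab["<sos>"], begin=True),
        T.AddToken(vocab["<eos>"], begin=False),
    )

    # Words missing from the vocabulary, such as "terrible", fall back to the
    # <unk> index because of `set_default_index`.
    print(transform(["have", "a", "terrible", "day"]))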
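
The last tutorial patch ends by noting that `data_pipe` can be used while writing a training function. The sketch below shows one way the bucket batching, source/target separation and padding steps could be wired together and consumed, with a handful of in-memory index sequences standing in for the real numericalized sentence pairs. The sequence values, the small batch sizes, the body of the sort key and the placeholder training step are assumptions made for illustration, not part of the tutorial::

    import torchdata.datapipes as dp
    import torchtext.transforms as T

    # Stand-ins for already numericalized (source, target) index sequences.
    pairs = [
        ([1, 5, 6, 2], [1, 7, 8, 9, 2]),
        ([1, 5, 2], [1, 7, 2]),
        ([1, 5, 6, 10, 11, 2], [1, 7, 8, 2]),
        ([1, 5, 6, 7, 2], [1, 9, 2]),
    ]

    def sortBucket(bucket):
        # Keep sequences of similar length together so little padding is needed.
        return sorted(bucket, key=lambda pair: (len(pair[0]), len(pair[1])))

    def separateSourceTarget(sequence_pairs):
        # [(X_1, y_1), (X_2, y_2)] -> ((X_1, X_2), (y_1, y_2))
        sources, targets = zip(*sequence_pairs)
        return sources, targets

    def applyPadding(pair_of_sequences):
        # Convert each group to a tensor, padding shorter sequences with 0,
        # which is assumed to be the index of the padding token.
        return (T.ToTensor(0)(list(pair_of_sequences[0])),
                T.ToTensor(0)(list(pair_of_sequences[1])))

    data_pipe = dp.iter.IterableWrapper(pairs)
    data_pipe = data_pipe.bucketbatch(
        batch_size=2, batch_num=2, bucket_num=1,
        use_in_batch_shuffle=False, sort_key=sortBucket,
    )
    data_pipe = data_pipe.map(separateSourceTarget)
    data_pipe = data_pipe.map(applyPadding)

    for sources, targets in data_pipe:
        # A real training step would feed `sources` to a model, compute a loss
        # against `targets`, and back-propagate; here we only inspect the shapes.
        print(sources.shape, targets.shape)

Because the sort key orders each bucket by length before batches are cut, the sequences inside a batch end up with similar lengths, which is the reason the tutorial prefers `bucketbatch` over plain batching.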