From f24a95983110a25d1c4237e95b90907ff0ea3589 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:16:55 +0530
Subject: [PATCH 01/31] added intro
---
.../torchtext_custom_dataset_tutorial.py | 27 +++++++++++++++++++
1 file changed, 27 insertions(+)
create mode 100644 beginner_source/torchtext_custom_dataset_tutorial.py
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
new file mode 100644
index 00000000000..daa6f39a28c
--- /dev/null
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+"""
+Preaparing custom text dataset using Torchtext
+==============================================
+
+**Author**: `Anupam Sharma `_
+
+This tutorial is regarding the preparation of a text dataset using Torchtext. In the tutorial, we
+will be preparing a custom dataset that can be further utilized to train a sequence-to-sequence
+model for machine translation (something like, in this tutorial: `Sequence to Sequence Learning
+with Neural Networks `_) but using Torchtext 0.15.0 instead
+of a legacy version.
+
+In this tutorial, we will learn how to:
+
+* Read a dataset
+* Tokenize sentence
+* Apply transforms to sentence
+* Perform bucket batching
+
+Let us assume that we need to prepare a dataset to train a model that can perform English to
+Finnish translation. We will use a tab-delimited Finnish - English sentence pairs provided by
+the `Tatoeba Project `_ which can be downloaded from this link: `Click
+Here `__
+"""
+
From a0ac5d5a93fd3b149dd1fe3aecebf4681d5ee7e5 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:18:10 +0530
Subject: [PATCH 02/31] added setup section
---
.../torchtext_custom_dataset_tutorial.py | 24 +++++++++++++++++++
1 file changed, 24 insertions(+)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index daa6f39a28c..62b7735ba47 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -25,3 +25,27 @@
Here `__
"""
+# %%
+# Setup
+# -----
+#
+# First, download the dataset, extract the zip, and note the path to the file `fin.txt`.
+# The dataset can be downloaded from this link: `Click Here `__ .
+#
+# Ensure that following packages are installed:
+#
+# * `Torchdata 0.6.0 `_ (Installation instructions: `C\
+# lick here `__)
+# * `Torchtext 0.15.0 `_ (Installation instructions:\
+# `Click here `__)
+# * Spacy (Docs: `Click here `__)
+#
+# Here, we are using `Spacy` to tokenize text. In simple words tokenization means to
+# convert a sentence to list of words. Spacy is a python package used for various Natural
+# Language Processing (NLP) tasks.
+#
+# Download the English and Finnish models from spacy as shown below: ::
+#
+# python -m spacy download en_core_web_sm
+# python -m spacy download fi_core_news_sm
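
Editorial aside: the same models can also be fetched from inside Python with spaCy's CLI helper. A minimal sketch, not part of the patch; it assumes spaCy is already installed and network access is available::

    import spacy.cli

    spacy.cli.download("en_core_web_sm")   # English tokenizer model
    spacy.cli.download("fi_core_news_sm")  # Finnish tokenizer model
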
From d1a010b74f56928a8853daa86b3bae3c44f91da0 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:20:19 +0530
Subject: [PATCH 03/31] import packages and read dataset
---
.../torchtext_custom_dataset_tutorial.py | 40 +++++++++++++++++++
1 file changed, 40 insertions(+)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 62b7735ba47..14f88da3511 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -49,3 +49,43 @@
#
# python -m spacy download en_core_web_sm
# python -m spacy download fi_core_news_sm
+
+
+# %%
+# Let us start by importing required modules:
+
+import torchdata.datapipes as dp
+import torchtext.transforms as T
+import spacy
+from torchtext.vocab import build_vocab_from_iterator
+eng = spacy.load("en_core_web_sm") # Load the English model to be used for tokenizing
+fin = spacy.load("fi_core_news_sm") # Load the Finnish model to be used for tokenizing
+
+# %%
+# Now we will load the dataset
+
+FILE_PATH = 'fin.txt'
+dataPipe = dp.iter.IterableWrapper([FILE_PATH])
+dataPipe = dp.iter.FileOpener(dataPipe, mode='rb')
+dataPipe = dataPipe.parse_csv(skip_lines=0, delimiter='\t', as_tuple=True)
+
+# %%
+# In the above code block, we are doing following things:
+#
+# 1. At line 2, we are creating an iterable of filenames
+# 2. At line 3, we pass the iterable to `FileOpener` which then
+# opens the file in read mode
+# 3. At line 4, we call a function to parse the file, which
+# again returns an iterable of tuples representing each rows
+# of the tab-delimited file
+#
+# Data pipes can be thought of something like a dataset object, on which
+# we can perform various operations. Check `this tutorial `_ for more details on data pipes.
+#
+# We can verify if the iterable has the pair of sentences as shown
+# below:
+
+for sample in dataPipe:
+ print(sample)
+ break
From b8d1dfa27fc63f00ac03007c681b258b3956a891 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:21:36 +0530
Subject: [PATCH 04/31] remove attributions from dataset
---
.../torchtext_custom_dataset_tutorial.py | 22 +++++++++++++++++++
1 file changed, 22 insertions(+)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 14f88da3511..9cf9441c640 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -89,3 +89,25 @@
for sample in dataPipe:
print(sample)
break
+
+# %%
+# Note that we also have attribution details along with pair of sentences. We will
+# write a small function to remove the attribution details:
+
+def remove_attribution(row):
+ """
+ Function to keep the first two elements in a tuple
+ """
+ return row[:2]
+dataPipe = dataPipe.map(remove_attribution)
+
+# %%
+# The `map` function at line 2 in above code block can be used to apply some function
+# on each elements of data pipe. Now, we can verify that the data pipe only contains
+# pair of sentences.
+
+
+for sample in dataPipe:
+ print(sample)
+ break
+
From da7d346bcc238b4b0389d30eb0a3272253a53f4f Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:22:46 +0530
Subject: [PATCH 05/31] added functions for tokenization
---
.../torchtext_custom_dataset_tutorial.py | 22 +++++++++++++++++++
1 file changed, 22 insertions(+)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 9cf9441c640..33da9be38ad 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -111,3 +111,25 @@ def remove_attribution(row):
print(sample)
break
+# %%
+# Now, let us define few functions to perform tokenization:
+
+def eng_tokenize(text):
+ """
+ Tokenize an English text and returns list of tokens
+ """
+ return [token.text for token in eng.tokenizer(text)]
+
+def fin_tokenize(text):
+ """
+ Tokenize a Finnish text and returns list of tokens
+ """
+ return [token.text for token in fin.tokenizer(text)]
+
+# %%
+# Above function accepts a text and returns a list of words
+# as shown below:
+
+print(eng_tokenize("Have a good day!!!"))
+print(fin_tokenize("Hyvää päivänjatkoa!!!"))
+
From 86f56e10f9b8b5949b3a78eb3009270a3e786015 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:23:48 +0530
Subject: [PATCH 06/31] building the vocabulary
---
.../torchtext_custom_dataset_tutorial.py | 66 +++++++++++++++++++
1 file changed, 66 insertions(+)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 33da9be38ad..5aa6f8d8bed 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -133,3 +133,69 @@ def fin_tokenize(text):
print(eng_tokenize("Have a good day!!!"))
print(fin_tokenize("Hyvää päivänjatkoa!!!"))
+# %%
+# Building the vocabulary
+# -----------------------
+# Let us consider an English sentence as the source and a Finnish sentence as the target.
+#
+# Vocabulary can be considered as the set of unique words we have in the dataset.
+# We will build vocabulary for both our source and target now.
+#
+# Let us define a function to get tokens from elements of tuples in the iterator.
+# The comments within the function specifies the need and working of it:
+
+def get_tokens(data_iter, place):
+ """
+ Function to yield tokens from an iterator. Since, our iterator contains
+ tuple of sentences (source and target), `place` parameters defines for which
+ index to return the tokens for. `place=0` for source and `place=1` for target
+ """
+ for english, finnish in data_iter:
+ if place == 0:
+ yield eng_tokenize(english)
+ else:
+ yield fin_tokenize(finnish)
+
+# %%
+# Now, we will build vocabulary for source:
+
+sourceVocab = build_vocab_from_iterator(
+ get_tokens(dataPipe,0),
+ min_freq=2,
+    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
+ special_first=True
+)
+sourceVocab.set_default_index(sourceVocab['<unk>'])
+
+# %%
+# The code above, builds the vocabulary from the iterator. In the above code block:
+#
+# * At line 2, we call the `get_tokens()` function with `place=0` as we need vocabulary for
+# source sentences.
+# * At line 3, we set `min_freq=2`. This means, the function will skip those words that occurs
+# less than 2 times.
+# * At line 4, we specify some special tokens:
+#
+# * `<sos>` for start of sentence
+# * `<eos>` for end of sentence
+# * `<unk>` for unknown words. An example of unknown word is the one skipped because of
+# `min_freq=2`.
+# * `<pad>` is the padding token. While training a model, we mostly train in batches. In a
+# batch, there can be sentences of different length. So, we pad the shorter sentences with
+# `<pad>` token to make length of all sequences in the batch equal.
+#
+# * At line 5, we set `special_first=True`. Which means `<pad>` will get index 0, `<sos>` index 1,
+# `<eos>` index 2, and `<unk>` will get index 3 in the vocabulary.
+# * At line 7, we set default index as index of `<unk>`. That means if some word is not in
+# vocabulary, we will use `<unk>` instead of that unknown word.
+#
+# Similarly, we will build vocabulary for target sentences:
+
+targetVocab = build_vocab_from_iterator(
+ get_tokens(dataPipe,1),
+ min_freq=2,
+    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
+ special_first=True
+)
+targetVocab.set_default_index(targetVocab['<unk>'])
+
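
Editorial aside: to make the effect of `min_freq` and the default index concrete, the built vocabulary can be queried directly. A small sketch, not part of the patch; it relies on torchtext's `Vocab` lookup methods, and the exact indices depend on the dataset::

    # tokens -> indices; words seen fewer than 2 times fall back to '<unk>'
    print(sourceVocab(['have', 'a', 'good', 'day']))
    print(sourceVocab['<unk>'])                      # 3, because of special_first=True
    print(sourceVocab.lookup_tokens([0, 1, 2, 3]))   # ['<pad>', '<sos>', '<eos>', '<unk>']
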
From 10e01a819b47f9dd93a5c6bfa27a9442a230776a Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:25:04 +0530
Subject: [PATCH 07/31] added some comments
---
beginner_source/torchtext_custom_dataset_tutorial.py | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 5aa6f8d8bed..004481403d9 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -199,3 +199,13 @@ def get_tokens(data_iter, place):
)
targetVocab.set_default_index(targetVocab['<unk>'])
+# %%
+# Note that the example above shows how can we add special tokens to our vocabulary. The
+# special tokens may change based on the requirements.
+#
+# Now, we can verify that special tokens are placed at the beginning and then other words.
+# In the below code, `sourceVocab.get_itos()` returns a list with tokens at index based on
+# vocabulary.
+
+print(sourceVocab.get_itos()[:9])
+
From 7dfa29f3db08949e2dd9400e3310fe342b38f29d Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:25:58 +0530
Subject: [PATCH 08/31] Numericalize sentences using vocabulary
---
.../torchtext_custom_dataset_tutorial.py | 64 +++++++++++++++++++
1 file changed, 64 insertions(+)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 004481403d9..beab7ee8641 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -209,3 +209,67 @@ def get_tokens(data_iter, place):
print(sourceVocab.get_itos()[:9])
+# %%
+# Numericalize sentences using vocabulary
+# ---------------------------------------
+# After building the vocabulary, we need to convert our sentences to corresponding indices.
+# Let us define some functions for this:
+
+def get_transform(vocab):
+ """
+ Create transforms based on given vocabulary. The returned transform is applied to sequence
+ of tokens.
+ """
+    text_transform = T.Sequential(
+ ## converts the sentences to indices based on given vocabulary
+ T.VocabTransform(vocab=vocab),
+        ## Add <sos> at beginning of each sentence. 1 because the index for <sos> in vocabulary is
+        # 1 as seen in previous section
+        T.AddToken(1, begin=True),
+        ## Add <eos> at end of each sentence. 2 because the index for <eos> in vocabulary is
+        # 2 as seen in previous section
+        T.AddToken(2, begin=False)
+ )
+    return text_transform
+
+# %%
+# Now, let us see how to use the above function. The function returns an object of `Transforms`
+# which we will use on our sentence. Let us take a random sentence and check the working of
+# the transform:
+
+tempList = list(dataPipe)
+someSetence = tempList[798][0]
+print("Some sentence=", end="")
+print(someSetence)
+transformedSentence = get_transform(sourceVocab)(eng_tokenize(someSetence))
+print("Transformed sentence=", end="")
+print(transformedSentence)
+indexToString = sourceVocab.get_itos()
+for index in transformedSentence:
+ print(indexToString[index], end=" ")
+
+# %%
+# In the above code:
+#
+# * At line 2, we take a source setence from list that we created from dataPipe at line 1
+# * At line 5, we get a transform based on a source vocabulary and apply it to a tokenized
+# sentence. Note that transforms take list of words and not a sentence.
+# * At line 8, we get the mapping of index to string and then use it get the transformed
+# sentence
+#
+# Now we will use functions of `dataPipe` to apply transform to all our sentences.
+# Let us define some more functions for this.
+
+def apply_transform(sequence_pair):
+ """
+ Apply transforms to sequence of tokens in a sequence pair
+ """
+
+ return (
+ get_transform(sourceVocab)(eng_tokenize(sequence_pair[0])),
+ get_transform(targetVocab)(fin_tokenize(sequence_pair[1]))
+ )
+dataPipe = dataPipe.map(apply_transform) ## Apply the function to each element in the iterator
+tempList = list(dataPipe)
+print(tempList[0])
+
From d9dea745c6261c3a87d2ab3cb6e1795f68a73ac7 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:27:06 +0530
Subject: [PATCH 09/31] bucket batching
---
.../torchtext_custom_dataset_tutorial.py | 53 +++++++++++++++++++
1 file changed, 53 insertions(+)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index beab7ee8641..1e7314d1013 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -273,3 +273,56 @@ def apply_transform(sequence_pair):
tempList = list(dataPipe)
print(tempList[0])
+# %%
+# Make batches (with bucket batch)
+# --------------------------------
+# Generally, we train models in batches. While working with sequence to sequence models, it is
+# recommended to keep the length of sequences in a batch similar. For that we will use
+# `bucketbatch` function of `dataPipe`.
+#
+# Let us define some functions that will be used by the `bucketbatch` function.
+
+def sort_bucket(bucket):
+ """
+ Function to sort a given bucket. Here, we want to sort based on the length of
+ source and target sequence.
+ """
+ return sorted(bucket, key=lambda x: (len(x[0]), len(x[1])))
+
+# %%
+# Now, we will apply the `bucketbatch` function:
+
+dataPipe = dataPipe.bucketbatch(
+ batch_size = 4, batch_num=5, bucket_num=1,
+ use_in_batch_shuffle=False, sort_key=sort_bucket
+)
+
+# %%
+# In the above code block:
+#
+# * We keep batch size = 4.
+# * `batch_num` is the number of batches to keep in a bucket
+# * `bucket_num` is the number of buckets to keep in a pool for shuffling
+# * `sort_key` specifies the function that takes a bucket and sorts it
+#
+# Now, let us consider a batch of source sentences as `X` and a batch of target sentences as `y`.
+# Generally, while training a model, we predict on a batch of `X` and compare the result with `y`.
+# But, a batch in our `dataPipe` is of the form `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]`:
+
+print(list(dataPipe)[0])
+# %%
+# So, we will now convert them into the form: `((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))`.
+# For this we will write a small function:
+
+def separate_source_target(sequence_pairs):
+ """
+ input of form: `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]`
+ output of form: `((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))`
+ """
+ sources,targets = zip(*sequence_pairs)
+ return sources,targets
+
+## Apply the function to each element in the iterator
+dataPipe = dataPipe.map(separate_source_target)
+print(list(dataPipe)[0])
+
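
Editorial aside: the effect of `sort_bucket` can be seen on a hand-made toy bucket, not part of the patch; plain Python lists stand in for index sequences::

    toy_bucket = [
        ([1, 5, 6, 2], [1, 7, 2]),   # source length 4
        ([1, 2], [1, 2]),            # source length 2
        ([1, 9, 2], [1, 8, 8, 2]),   # source length 3
    ]
    print(sort_bucket(toy_bucket))
    # [([1, 2], [1, 2]), ([1, 9, 2], [1, 8, 8, 2]), ([1, 5, 6, 2], [1, 7, 2])]
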
From 2dd7f197fd9cdc5e30b6cee3be0250d8cf284da9 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:27:57 +0530
Subject: [PATCH 10/31] applied padding
---
.../torchtext_custom_dataset_tutorial.py | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 1e7314d1013..a6d42ea999f 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -326,3 +326,21 @@ def separate_source_target(sequence_pairs):
dataPipe = dataPipe.map(separate_source_target)
print(list(dataPipe)[0])
+# %%
+# Now, we have the data as desired.
+#
+# Padding
+# -------
+# As discussed earlier while building vocabulary, we need to pad shorter sentences in a batch to
+# make all the sequences in a batch of equal length. We can perform padding as follows:
+
+def apply_padding(pair_of_sequences):
+ """
+    Convert sequences to tensors and apply padding
+ """
+ return (T.ToTensor(0)(list(pair_of_sequences[0])), T.ToTensor(0)(list(pair_of_sequences[1])))
+## `T.ToTensor(0)` returns a transform that converts the sequence to `torch.tensor` and also applies
+# padding. Here, `0` is passed to the constructor to specify the index of the `<pad>` token in the
+# vocabulary.
+dataPipe = dataPipe.map(apply_padding)
+
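
Editorial aside: what `T.ToTensor(0)` does can be checked on two toy sequences of unequal length (a sketch, not part of the patch); the shorter one is padded with `0`, the index of `<pad>`::

    print(T.ToTensor(0)([[1, 5, 2], [1, 2]]))
    # tensor([[1, 5, 2],
    #         [1, 2, 0]])
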
From 1fec4b5aa0cc4b6565417420e76e6c7c02e05735 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:31:18 +0530
Subject: [PATCH 11/31] view the final result
---
.../torchtext_custom_dataset_tutorial.py | 28 +++++++++++++++++++
1 file changed, 28 insertions(+)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index a6d42ea999f..9e0a3ed1c6e 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -344,3 +344,31 @@ def apply_padding(pair_of_sequences):
# vocabulary.
dataPipe = dataPipe.map(apply_padding)
+# %%
+# Now, we can use the index to string mapping to see how the sequence would look with tokens
+# instead of indices:
+
+sourceItoS = sourceVocab.get_itos()
+targetItoS = targetVocab.get_itos()
+
+def show_some_transformed_senetnces(data_pipe):
+ """
+    Function to show how the sentences look after applying all transforms.
+ Here we try to print actual words instead of corresponding index
+ """
+ for sources,targets in data_pipe:
+ if sources[0][-1] != 0:
+ continue # Just to visualize padding of shorter sentences
+ for i in range(4):
+ source = ""
+ for token in sources[i]:
+ source += " " + sourceItoS[token]
+ target = ""
+ for token in targets[i]:
+ target += " " + targetItoS[token]
+ print(f"Source: {source}")
+            print(f"Target: {target}")
+ break
+# %%
+# In the above output we can observe that the shorter sentences are padded with `<pad>`. Now, we can
+# use this dataPipe while writing our training function.
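
Editorial aside: a minimal sketch (an assumption, not shown in the patch) of how the finished pipe could be consumed, for example by wrapping it in a standard `DataLoader`; `batch_size=None` is used because batching is already done by `bucketbatch`::

    from torch.utils.data import DataLoader

    loader = DataLoader(dataPipe, batch_size=None)
    for sources, targets in loader:
        # sources and targets are padded integer tensors of shape (4, max_len)
        print(sources.shape, targets.shape)
        break
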
From 158c5fd2ad40028dea67a48152ffbe6927729c16 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:39:16 +0530
Subject: [PATCH 12/31] added torchtext logo
---
.../img/thumbnails/cropped/torch_text_logo.png | Bin 0 -> 8545 bytes
1 file changed, 0 insertions(+), 0 deletions(-)
create mode 100644 _static/img/thumbnails/cropped/torch_text_logo.png
diff --git a/_static/img/thumbnails/cropped/torch_text_logo.png b/_static/img/thumbnails/cropped/torch_text_logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..3fe736d60e282d080ac3165e3a5aad3c69bcc934
GIT binary patch
literal 8545
(base85-encoded binary image data omitted)
Date: Thu, 4 May 2023 19:45:19 +0530
Subject: [PATCH 13/31] added card in index.rst
---
index.rst | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/index.rst b/index.rst
index 48550e6c4ac..ecb64a1aa6e 100644
--- a/index.rst
+++ b/index.rst
@@ -264,6 +264,13 @@ What's new in PyTorch tutorials?
:link: beginner/translation_transformer.html
:tags: Text
+.. customcarditem::
+ :header: Preaparing custom text dataset using Torchtext
+ :card_description: Learn how to use torchtext to prepare a custom dataset
+ :image: _static/img/thumbnails/cropped/torch_text_logo.png
+ :link: beginner/torchtext_custom_dataset_tutorial.html
+ :tags: Text
+
.. Reinforcement Learning
From 09dd73b4ae63bb4b3f77942b5453476fb5f50dc7 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:50:05 +0530
Subject: [PATCH 14/31] added entry in toctree
---
index.rst | 1 +
1 file changed, 1 insertion(+)
diff --git a/index.rst b/index.rst
index ecb64a1aa6e..d23ec3a701a 100644
--- a/index.rst
+++ b/index.rst
@@ -877,6 +877,7 @@ Additional Resources
intermediate/seq2seq_translation_tutorial
beginner/text_sentiment_ngrams_tutorial
beginner/translation_transformer
+ beginner/torchtext_custom_dataset_tutorial
.. toctree::
From f7619d73a870b7a25c2a63230a0477eb1978eff1 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 20:06:01 +0530
Subject: [PATCH 15/31] updated Makefile for downloading dataset
---
Makefile | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/Makefile b/Makefile
index a01ea69bb50..fe5cd32e31e 100644
--- a/Makefile
+++ b/Makefile
@@ -106,6 +106,10 @@ download:
wget -nv -N http://dl.fbaipublicfiles.com/pythia/data/vocab.tar.gz -P $(DATADIR)
tar $(TAROPTS) -xzf $(DATADIR)/vocab.tar.gz -C ./beginner_source/data/
+ # Download dataset for beginner_source/torchtext_custom_dataset_tutorial.py
+ wget -nv -N https://www.manythings.org/anki/fin-eng.zip -P $(DATADIR)
+ unzip -o $(DATADIR)/fin-eng.zip -d beginner_source/data/
+
docs:
make download
From 1e4163d91a906483212162d6fdc8e46c8da2d637 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 20:07:27 +0530
Subject: [PATCH 16/31] get dataset from data folder
---
beginner_source/torchtext_custom_dataset_tutorial.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 9e0a3ed1c6e..9281998054e 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -64,7 +64,7 @@
# %%
# Now we will load the dataset
-FILE_PATH = 'fin.txt'
+FILE_PATH = 'data/fin.txt'
dataPipe = dp.iter.IterableWrapper([FILE_PATH])
dataPipe = dp.iter.FileOpener(dataPipe, mode='rb')
dataPipe = dataPipe.parse_csv(skip_lines=0, delimiter='\t', as_tuple=True)
From 1d96ae31d27a068a4720ed2abc1bbc3613c46c3a Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 22:18:09 +0530
Subject: [PATCH 17/31] updated comment
---
beginner_source/torchtext_custom_dataset_tutorial.py | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 9281998054e..b9ae85fde93 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -9,8 +9,7 @@
will be preparing a custom dataset that can be further utilized to train a sequence-to-sequence
model for machine translation (something like, in this tutorial: `Sequence to Sequence Learning
with Neural Networks `_) but using Torchtext 0.15.0 instead
-of a legacy version.
+20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb>`_) but without using legacy version.
In this tutorial, we will learn how to:
From af728f1893cb7d2bf0ba426b7a9f62922d9d5ced Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 22:57:11 +0530
Subject: [PATCH 18/31] updated function to view results, and added some
sources
---
beginner_source/torchtext_custom_dataset_tutorial.py | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index b9ae85fde93..6a4b6991e3b 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -350,7 +350,7 @@ def apply_padding(pair_of_sequences):
sourceItoS = sourceVocab.get_itos()
targetItoS = targetVocab.get_itos()
-def show_some_transformed_senetnces(data_pipe):
+def show_some_transformed_sentences(data_pipe):
"""
    Function to show how the sentences look after applying all transforms.
Here we try to print actual words instead of corresponding index
@@ -368,6 +368,11 @@ def show_some_transformed_senetnces(data_pipe):
print(f"Source: {source}")
print(f"Traget: {target}")
break
+
+show_some_transformed_sentences(dataPipe)
# %%
-# In the above output we can observe that the shorter sentences are padded with `<pad>`. Now, we can
-# use this dataPipe while writing our training function.
+# In the above output we can observe that the shorter sentences are padded with `<pad>`. Now, we\
+# can use this dataPipe while writing our training function.
+#
+# Some parts of this tutorial was inspired from this article: `Click here: `__.
From 519ccbfe51723b2e64c57a123bae265154c66571 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 23:09:42 +0530
Subject: [PATCH 19/31] updated typo
---
beginner_source/torchtext_custom_dataset_tutorial.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 6a4b6991e3b..aeb1ef86de4 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -374,5 +374,5 @@ def show_some_transformed_sentences(data_pipe):
# In the above output we can observe that the shorter sentences are padded with `<pad>`. Now, we\
# can use this dataPipe while writing our training function.
#
-# Some parts of this tutorial was inspired from this article: `Click here: `__.
From 332356be56c643a7ef1821f10e3fa5ef9e09826e Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Fri, 5 May 2023 01:11:53 +0530
Subject: [PATCH 20/31] fixed hyperlinks
---
.../torchtext_custom_dataset_tutorial.py | 27 +++++++++++--------
1 file changed, 16 insertions(+), 11 deletions(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index aeb1ef86de4..d22a41eb303 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -9,7 +9,8 @@
will be preparing a custom dataset that can be further utilized to train a sequence-to-sequence
model for machine translation (something like, in this tutorial: `Sequence to Sequence Learning
with Neural Networks `_) but without using legacy version.
+20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb>`_) but without using legacy version
+of torchtext.
In this tutorial, we will learn how to:
@@ -21,16 +22,18 @@
Let us assume that we need to prepare a dataset to train a model that can perform English to
Finnish translation. We will use a tab-delimited Finnish - English sentence pairs provided by
the `Tatoeba Project `_ which can be downloaded from this link: `Click
-Here `__
+Here `__.
+
+Sentence pairs for other languages can be found in this link:
+
+Link: `https://www.manythings.org/anki/ `__
"""
# %%
-# Setup
-# -----
+#Setup
+#-----
#
# First, download the dataset, extract the zip, and note the path to the file `fin.txt`.
-# The dataset can be downloaded from this link: `Click Here `__ .
#
# Ensure that following packages are installed:
#
@@ -79,8 +82,9 @@
# of the tab-delimited file
#
# Data pipes can be thought of something like a dataset object, on which
-# we can perform various operations. Check `this tutorial `_ for more details on data pipes.
+# we can perform various operations.
+# Check `this tutorial `_ for more details on
+# data pipes.
#
# We can verify if the iterable has the pair of sentences as shown
# below:
@@ -371,8 +375,9 @@ def show_some_transformed_sentences(data_pipe):
show_some_transformed_sentences(dataPipe)
# %%
-# In the above output we can observe that the shorter sentences are padded with `<pad>`. Now, we\
+# In the above output we can observe that the shorter sentences are padded with `<pad>`. Now, we
# can use this dataPipe while writing our training function.
#
-# Some parts of this tutorial was inspired from this article: `Click here `__.
+# Some parts of this tutorial was inspired from this article:
+# `Link https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71 \
+# `__.
From 44848ff67d565f4ab5869ea30ea0e656b257e05f Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Fri, 5 May 2023 01:22:00 +0530
Subject: [PATCH 21/31] changed title and introduction
---
beginner_source/torchtext_custom_dataset_tutorial.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index d22a41eb303..f4e5350cbd9 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -1,12 +1,12 @@
# -*- coding: utf-8 -*-
"""
-Preaparing custom text dataset using Torchtext
+Pre-process custom text dataset using Torchtext
==============================================
**Author**: `Anupam Sharma `_
-This tutorial is regarding the preparation of a text dataset using Torchtext. In the tutorial, we
-will be preparing a custom dataset that can be further utilized to train a sequence-to-sequence
+This tutorial illustrates the usage of torchtext on a dataset that is not built-in. In the tutorial,
+we will pre-process a dataset that can be further utilized to train a sequence-to-sequence
model for machine translation (something like, in this tutorial: `Sequence to Sequence Learning
with Neural Networks `_) but without using legacy version
From e007c60402c0e91dd1014b2bdb7e30bd027d493f Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Fri, 5 May 2023 01:46:14 +0530
Subject: [PATCH 22/31] fixed indentation issue
---
beginner_source/torchtext_custom_dataset_tutorial.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index f4e5350cbd9..e66333d8a15 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -30,8 +30,8 @@
"""
# %%
-#Setup
-#-----
+# Setup
+# -----
#
# First, download the dataset, extract the zip, and note the path to the file `fin.txt`.
#
From 66dd97295781d5f911430f886258147cda3754f4 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Fri, 5 May 2023 02:11:28 +0530
Subject: [PATCH 23/31] fixed typo
---
.../torchtext_custom_dataset_tutorial.py | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index e66333d8a15..20bea700745 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -241,10 +241,10 @@ def get_transform(vocab):
# the transform:
tempList = list(dataPipe)
-someSetence = tempList[798][0]
+someSentence = tempList[798][0]
print("Some sentence=", end="")
-print(someSetence)
-transformedSentence = get_transform(sourceVocab)(eng_tokenize(someSetence))
+print(someSentence)
+transformedSentence = get_transform(sourceVocab)(eng_tokenize(someSentence))
print("Transformed sentence=", end="")
print(transformedSentence)
indexToString = sourceVocab.get_itos()
@@ -254,7 +254,7 @@ def get_transform(vocab):
# %%
# In the above code:
#
-# * At line 2, we take a source setence from list that we created from dataPipe at line 1
+# * At line 2, we take a source sentence from list that we created from dataPipe at line 1
# * At line 5, we get a transform based on a source vocabulary and apply it to a tokenized
# sentence. Note that transforms take list of words and not a sentence.
# * At line 8, we get the mapping of index to string and then use it get the transformed
@@ -378,6 +378,6 @@ def show_some_transformed_sentences(data_pipe):
# In the above output we can observe that the shorter sentences are padded with `<pad>`. Now, we
# can use this dataPipe while writing our training function.
#
-# Some parts of this tutorial was inspired from this article:
-# `Link https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71 \
-# `__.
+# Some parts of this tutorial was inspired from this article: Torchtext DataLoaders in version 0.14.0
+# `Link: https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71 \
+# `__.
From 190e4a1fadda8ff3d22922c8cc1bdf11f4642207 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Fri, 5 May 2023 02:27:45 +0530
Subject: [PATCH 24/31] fixed typo
---
beginner_source/torchtext_custom_dataset_tutorial.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 20bea700745..d6e6cdcf3d5 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -378,6 +378,6 @@ def show_some_transformed_sentences(data_pipe):
# In the above output we can observe that the shorter sentences are padded with `<pad>`. Now, we
# can use this dataPipe while writing our training function.
#
-# Some parts of this tutorial was inspired from this article: Torchtext DataLoaders in version 0.14.0
-# `Link: https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71 \
-# `__.
+# Some parts of this tutorial were inspired by this article:
+# Link: `https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71\
+# <https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71>`__.
From ba5efff2f15c235633f39ea86d06561944dcec1b Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Fri, 5 May 2023 03:26:57 +0530
Subject: [PATCH 25/31] replaced Finnish with German as spacy German model is
already there in build
---
.../torchtext_custom_dataset_tutorial.py | 30 +++++++++----------
1 file changed, 15 insertions(+), 15 deletions(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index d6e6cdcf3d5..ad749d9fefb 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -20,9 +20,9 @@
* Perform bucket batching
Let us assume that we need to prepare a dataset to train a model that can perform English to
-Finnish translation. We will use a tab-delimited Finnish - English sentence pairs provided by
+German translation. We will use a tab-delimited German - English sentence pairs provided by
the `Tatoeba Project `_ which can be downloaded from this link: `Click
-Here `__.
+Here `__.
Sentence pairs for other languages can be found in this link:
@@ -33,7 +33,7 @@
# Setup
# -----
#
-# First, download the dataset, extract the zip, and note the path to the file `fin.txt`.
+# First, download the dataset, extract the zip, and note the path to the file `deu.txt`.
#
# Ensure that following packages are installed:
#
@@ -47,10 +47,10 @@
# convert a sentence to list of words. Spacy is a python package used for various Natural
# Language Processing (NLP) tasks.
#
-# Download the English and Finnish models from spacy as shown below: ::
+# Download the English and German models from spacy as shown below: ::
#
# python -m spacy download en_core_web_sm
-# python -m spacy download fi_core_news_sm
+# python -m spacy download de_core_news_sm
# %%
@@ -61,12 +61,12 @@
import spacy
from torchtext.vocab import build_vocab_from_iterator
eng = spacy.load("en_core_web_sm") # Load the English model to be used for tokenizing
-fin = spacy.load("fi_core_news_sm") # Load the Finnish model to be used for tokenizing
+de = spacy.load("de_core_news_sm") # Load the German model to be used for tokenizing
# %%
# Now we will load the dataset
-FILE_PATH = 'data/fin.txt'
+FILE_PATH = 'data/deu.txt'
dataPipe = dp.iter.IterableWrapper([FILE_PATH])
dataPipe = dp.iter.FileOpener(dataPipe, mode='rb')
dataPipe = dataPipe.parse_csv(skip_lines=0, delimiter='\t', as_tuple=True)
@@ -123,23 +123,23 @@ def eng_tokenize(text):
"""
return [token.text for token in eng.tokenizer(text)]
-def fin_tokenize(text):
+def de_tokenize(text):
"""
- Tokenize a Finnish text and returns list of tokens
+ Tokenize a German text and returns list of tokens
"""
- return [token.text for token in fin.tokenizer(text)]
+ return [token.text for token in de.tokenizer(text)]
# %%
# Above function accepts a text and returns a list of words
# as shown below:
print(eng_tokenize("Have a good day!!!"))
-print(fin_tokenize("Hyvää päivänjatkoa!!!"))
+print(de_tokenize("Haben Sie einen guten Tag!!!"))
# %%
# Building the vocabulary
# -----------------------
-# Let us consider an English sentence as the source and a Finnish sentence as the target.
+# Let us consider an English sentence as the source and a German sentence as the target.
#
# Vocabulary can be considered as the set of unique words we have in the dataset.
# We will build vocabulary for both our source and target now.
@@ -153,11 +153,11 @@ def get_tokens(data_iter, place):
tuple of sentences (source and target), `place` parameters defines for which
index to return the tokens for. `place=0` for source and `place=1` for target
"""
- for english, finnish in data_iter:
+ for english, german in data_iter:
if place == 0:
yield eng_tokenize(english)
else:
- yield fin_tokenize(finnish)
+ yield de_tokenize(german)
# %%
# Now, we will build vocabulary for source:
@@ -270,7 +270,7 @@ def apply_transform(sequence_pair):
return (
get_transform(sourceVocab)(eng_tokenize(sequence_pair[0])),
- get_transform(targetVocab)(fin_tokenize(sequence_pair[1]))
+ get_transform(targetVocab)(de_tokenize(sequence_pair[1]))
)
dataPipe = dataPipe.map(apply_transform) ## Apply the function to each element in the iterator
tempList = list(dataPipe)
From 8b105b6f905361ff7024ed1462e060e3571311f1 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Fri, 5 May 2023 03:30:18 +0530
Subject: [PATCH 26/31] fixed issue in title
---
beginner_source/torchtext_custom_dataset_tutorial.py | 2 +-
index.rst | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index ad749d9fefb..e4ec3b48100 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
"""
Pre-process custom text dataset using Torchtext
-==============================================
+===============================================
**Author**: `Anupam Sharma `_
diff --git a/index.rst b/index.rst
index d23ec3a701a..6e6d687d0c1 100644
--- a/index.rst
+++ b/index.rst
@@ -265,7 +265,7 @@ What's new in PyTorch tutorials?
:tags: Text
.. customcarditem::
- :header: Preaparing custom text dataset using Torchtext
+ :header: Pre-process custom text dataset using Torchtext
:card_description: Learn how to use torchtext to prepare a custom dataset
:image: _static/img/thumbnails/cropped/torch_text_logo.png
:link: beginner/torchtext_custom_dataset_tutorial.html
From 409b29d046dc904d2e5f1905e8f071276e8250c2 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Fri, 5 May 2023 03:31:38 +0530
Subject: [PATCH 27/31] use another dataset
---
Makefile | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/Makefile b/Makefile
index fe5cd32e31e..ed0ade00465 100644
--- a/Makefile
+++ b/Makefile
@@ -107,8 +107,8 @@ download:
tar $(TAROPTS) -xzf $(DATADIR)/vocab.tar.gz -C ./beginner_source/data/
# Download dataset for beginner_source/torchtext_custom_dataset_tutorial.py
- wget -nv -N https://www.manythings.org/anki/fin-eng.zip -P $(DATADIR)
- unzip -o $(DATADIR)/fin-eng.zip -d beginner_source/data/
+ wget -nv -N https://www.manythings.org/anki/deu-eng.zip -P $(DATADIR)
+ unzip -o $(DATADIR)/deu-eng.zip -d beginner_source/data/
docs:
From 8925bcba4e9c9282ecf7eae320e519f8a4b8f32f Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Tue, 16 May 2023 18:43:05 +0530
Subject: [PATCH 28/31] addressed review comments for PR #2307
---
.../torchtext_custom_dataset_tutorial.py | 164 +++++++++---------
1 file changed, 81 insertions(+), 83 deletions(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index e4ec3b48100..1439615050b 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -21,12 +21,11 @@
Let us assume that we need to prepare a dataset to train a model that can perform English to
German translation. We will use a tab-delimited German - English sentence pairs provided by
-the `Tatoeba Project `_ which can be downloaded from this link: `Click
-Here `__.
+the `Tatoeba Project `_ which can be downloaded from
+`this link `__.
-Sentence pairs for other languages can be found in this link:
-
-Link: `https://www.manythings.org/anki/ `__
+Sentence pairs for other languages can be found in `this link `\
+__.
"""
# %%
@@ -37,11 +36,11 @@
#
# Ensure that following packages are installed:
#
-# * `Torchdata 0.6.0 `_ (Installation instructions: `C\
-# lick here `__)
-# * `Torchtext 0.15.0 `_ (Installation instructions:\
-# `Click here `__)
-# * Spacy (Docs: `Click here `__)
+# * `Torchdata 0.6.0 `_ (`Installation instructions \
+# `__)
+# * `Torchtext 0.15.0 `_ (`Installation instructions \
+# `__)
+# * `Spacy `__
#
# Here, we are using `Spacy` to tokenize text. In simple words tokenization means to
# convert a sentence to list of words. Spacy is a python package used for various Natural
@@ -67,9 +66,9 @@
# Now we will load the dataset
FILE_PATH = 'data/deu.txt'
-dataPipe = dp.iter.IterableWrapper([FILE_PATH])
-dataPipe = dp.iter.FileOpener(dataPipe, mode='rb')
-dataPipe = dataPipe.parse_csv(skip_lines=0, delimiter='\t', as_tuple=True)
+data_pipe = dp.iter.IterableWrapper([FILE_PATH])
+data_pipe = dp.iter.FileOpener(data_pipe, mode='rb')
+data_pipe = data_pipe.parse_csv(skip_lines=0, delimiter='\t', as_tuple=True)
# %%
# In the above code block, we are doing following things:
@@ -81,15 +80,15 @@
# again returns an iterable of tuples representing each rows
# of the tab-delimited file
#
-# Data pipes can be thought of something like a dataset object, on which
+# DataPipes can be thought of as something like a dataset object, on which
# we can perform various operations.
# Check `this tutorial `_ for more details on
-# data pipes.
+# DataPipes.
#
# We can verify if the iterable has the pair of sentences as shown
# below:
-for sample in dataPipe:
+for sample in data_pipe:
print(sample)
break
@@ -97,35 +96,35 @@
# Note that we also have attribution details along with pair of sentences. We will
# write a small function to remove the attribution details:
-def remove_attribution(row):
+def removeAttribution(row):
"""
Function to keep the first two elements in a tuple
"""
return row[:2]
-dataPipe = dataPipe.map(remove_attribution)
+data_pipe = data_pipe.map(removeAttribution)
# %%
-# The `map` function at line 2 in above code block can be used to apply some function
-# on each elements of data pipe. Now, we can verify that the data pipe only contains
+# The `map` function at line 6 in above code block can be used to apply some function
+# on each element of `data_pipe`. Now, we can verify that the `data_pipe` only contains
# pair of sentences.
-for sample in dataPipe:
+for sample in data_pipe:
print(sample)
break
# %%
# Now, let us define few functions to perform tokenization:
-def eng_tokenize(text):
+def engTokenize(text):
"""
- Tokenize an English text and returns list of tokens
+ Tokenize an English text and return a list of tokens
"""
return [token.text for token in eng.tokenizer(text)]
-def de_tokenize(text):
+def deTokenize(text):
"""
- Tokenize a German text and returns list of tokens
+ Tokenize a German text and return a list of tokens
"""
return [token.text for token in de.tokenizer(text)]
@@ -133,8 +132,8 @@ def de_tokenize(text):
# Above function accepts a text and returns a list of words
# as shown below:
-print(eng_tokenize("Have a good day!!!"))
-print(de_tokenize("Haben Sie einen guten Tag!!!"))
+print(engTokenize("Have a good day!!!"))
+print(deTokenize("Haben Sie einen guten Tag!!!"))
# %%
# Building the vocabulary
@@ -145,9 +144,9 @@ def de_tokenize(text):
# We will build vocabulary for both our source and target now.
#
# Let us define a function to get tokens from elements of tuples in the iterator.
-# The comments within the function specifies the need and working of it:
-def get_tokens(data_iter, place):
+
+def getTokens(data_iter, place):
"""
Function to yield tokens from an iterator. Since, our iterator contains
tuple of sentences (source and target), `place` parameters defines for which
@@ -155,25 +154,25 @@ def get_tokens(data_iter, place):
"""
for english, german in data_iter:
if place == 0:
- yield eng_tokenize(english)
+ yield engTokenize(english)
else:
- yield de_tokenize(german)
+ yield deTokenize(german)
# %%
# Now, we will build vocabulary for source:
-sourceVocab = build_vocab_from_iterator(
- get_tokens(dataPipe,0),
+source_vocab = build_vocab_from_iterator(
+ getTokens(data_pipe,0),
min_freq=2,
    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
special_first=True
)
-sourceVocab.set_default_index(sourceVocab['<unk>'])
+source_vocab.set_default_index(source_vocab['<unk>'])
# %%
# The code above, builds the vocabulary from the iterator. In the above code block:
#
-# * At line 2, we call the `get_tokens()` function with `place=0` as we need vocabulary for
+# * At line 2, we call the `getTokens()` function with `place=0` as we need vocabulary for
# source sentences.
# * At line 3, we set `min_freq=2`. This means, the function will skip those words that occurs
# less than 2 times.
@@ -194,23 +193,23 @@ def get_tokens(data_iter, place):
#
# Similarly, we will build vocabulary for target sentences:
-targetVocab = build_vocab_from_iterator(
- get_tokens(dataPipe,1),
+target_vocab = build_vocab_from_iterator(
+ getTokens(data_pipe,1),
min_freq=2,
    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
special_first=True
)
-targetVocab.set_default_index(targetVocab['<unk>'])
+target_vocab.set_default_index(target_vocab['<unk>'])
# %%
# Note that the example above shows how can we add special tokens to our vocabulary. The
# special tokens may change based on the requirements.
#
# Now, we can verify that special tokens are placed at the beginning and then other words.
-# In the below code, `sourceVocab.get_itos()` returns a list with tokens at index based on
+# In the below code, `source_vocab.get_itos()` returns a list with tokens at index based on
# vocabulary.
-print(sourceVocab.get_itos()[:9])
+print(source_vocab.get_itos()[:9])
# %%
# Numericalize sentences using vocabulary
@@ -218,7 +217,7 @@ def get_tokens(data_iter, place):
# After building the vocabulary, we need to convert our sentences to corresponding indices.
# Let us define some functions for this:
-def get_transform(vocab):
+def getTransform(vocab):
"""
Create transforms based on given vocabulary. The returned transform is applied to sequence
of tokens.
@@ -237,55 +236,55 @@ def get_transform(vocab):
# %%
# Now, let us see how to use the above function. The function returns an object of `Transforms`
-# which we will use on our sentence. Let us take a random sentence and check the working of
-# the transform:
+# which we will use on our sentence. Let us take a random sentence and check how the transform
+# works.
-tempList = list(dataPipe)
-someSentence = tempList[798][0]
+temp_list = list(data_pipe)
+some_sentence = temp_list[798][0]
print("Some sentence=", end="")
-print(someSentence)
-transformedSentence = get_transform(sourceVocab)(eng_tokenize(someSentence))
+print(some_sentence)
+transformed_sentence = getTransform(source_vocab)(engTokenize(some_sentence))
print("Transformed sentence=", end="")
-print(transformedSentence)
-indexToString = sourceVocab.get_itos()
-for index in transformedSentence:
- print(indexToString[index], end=" ")
+print(transformed_sentence)
+index_to_string = source_vocab.get_itos()
+for index in transformed_sentence:
+ print(index_to_string[index], end=" ")
# %%
# In the above code:
#
-# * At line 2, we take a source sentence from list that we created from dataPipe at line 1
-# * At line 5, we get a transform based on a source vocabulary and apply it to a tokenized
-# sentence. Note that transforms take list of words and not a sentence.
-# * At line 8, we get the mapping of index to string and then use it get the transformed
-# sentence
+# * At line 2, we take a source sentence from the list that we created from `data_pipe` at line 1
+# * At line 5, we get a transform based on a source vocabulary and apply it to a tokenized
+#   sentence. Note that transforms take a list of words and not a sentence.
+# * At line 8, we get the mapping of index to string and then use it to get the transformed
+#   sentence
#
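# %%
# The body of `getTransform` is not visible in this diff. For orientation only, here is a sketch
# of one common way to compose such a transform with `torchtext.transforms` (imported above as
# `T`): convert tokens to indices, then add the `<sos>` and `<eos>` tokens around the sequence.
# The helper name and the exact structure are assumptions, not the file's actual code:

def getTransformSketch(vocab):
    """Illustrative only: token list -> indices, wrapped with <sos> and <eos>."""
    return T.Sequential(
        T.VocabTransform(vocab),                 # map tokens to vocabulary indices
        T.AddToken(vocab['<sos>'], begin=True),  # prepend start-of-sentence index
        T.AddToken(vocab['<eos>'], begin=False), # append end-of-sentence index
    )

# If the assumptions above hold, this should mirror the transformed sentence printed earlier:
print(getTransformSketch(source_vocab)(engTokenize(some_sentence)))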
-# Now we will use functions of `dataPipe` to apply transform to all our sentences.
+# Now we will use DataPipe functions to apply the transform to all our sentences.
# Let us define some more functions for this.
-def apply_transform(sequence_pair):
+def applyTransform(sequence_pair):
"""
    Apply transforms to the sequences of tokens in a sequence pair
"""
return (
- get_transform(sourceVocab)(eng_tokenize(sequence_pair[0])),
- get_transform(targetVocab)(de_tokenize(sequence_pair[1]))
+ getTransform(source_vocab)(engTokenize(sequence_pair[0])),
+ getTransform(target_vocab)(deTokenize(sequence_pair[1]))
)
-dataPipe = dataPipe.map(apply_transform) ## Apply the function to each element in the iterator
-tempList = list(dataPipe)
-print(tempList[0])
+data_pipe = data_pipe.map(applyTransform) ## Apply the function to each element in the iterator
+temp_list = list(data_pipe)
+print(temp_list[0])
# %%
# Make batches (with bucket batch)
# --------------------------------
# Generally, we train models in batches. While working with sequence-to-sequence models, it is
# recommended to keep the length of sequences in a batch similar. For that we will use
-# `bucketbatch` function of `dataPipe`.
+# `bucketbatch` function of `data_pipe`.
#
# Let us define some functions that will be used by the `bucketbatch` function.
-def sort_bucket(bucket):
+def sortBucket(bucket):
"""
Function to sort a given bucket. Here, we want to sort based on the length of
    source and target sequences.
@@ -295,9 +294,9 @@ def sort_bucket(bucket):
# %%
# Now, we will apply the `bucketbatch` function:
-dataPipe = dataPipe.bucketbatch(
+data_pipe = data_pipe.bucketbatch(
batch_size = 4, batch_num=5, bucket_num=1,
- use_in_batch_shuffle=False, sort_key=sort_bucket
+ use_in_batch_shuffle=False, sort_key=sortBucket
)
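# %%
# The body of `sortBucket` is cut off in the hunk above. A minimal implementation consistent with
# its docstring would sort each bucket by the lengths of the source and target sequences. The
# helper below is only an illustrative sketch under that assumption (note the different name);
# it is not necessarily the exact code in the file:

def sortBucketSketch(bucket):
    """Sort (source, target) pairs by source length, then by target length."""
    return sorted(bucket, key=lambda pair: (len(pair[0]), len(pair[1])))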
# %%
@@ -310,14 +309,14 @@ def sort_bucket(bucket):
#
# Now, let us consider a batch of source sentences as `X` and a batch of target sentences as `y`.
# Generally, while training a model, we predict on a batch of `X` and compare the result with `y`.
-# But, a batch in our `dataPipe` is of the form `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]`:
+# But, a batch in our `data_pipe` is of the form `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]`:
-print(list(dataPipe)[0])
+print(list(data_pipe)[0])
# %%
# So, we will now convert them into the form: `((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))`.
# For this we will write a small function:
-def separate_source_target(sequence_pairs):
+def separateSourceTarget(sequence_pairs):
"""
input of form: `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]`
output of form: `((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))`
@@ -326,8 +325,8 @@ def separate_source_target(sequence_pairs):
return sources,targets
## Apply the function to each element in the iterator
-dataPipe = dataPipe.map(separate_source_target)
-print(list(dataPipe)[0])
+data_pipe = data_pipe.map(separateSourceTarget)
+print(list(data_pipe)[0])
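# %%
# The middle of `separateSourceTarget` is elided by the diff. Reshaping a list of pairs into a
# pair of tuples is typically a one-line `zip`; the helper below is a sketch of that idea and an
# assumption about the elided body, not necessarily the exact code in the file:

def separateSourceTargetSketch(sequence_pairs):
    """Turn [(X_1,y_1), ..., (X_n,y_n)] into ((X_1,...,X_n), (y_1,...,y_n))."""
    sources, targets = zip(*sequence_pairs)
    return sources, targets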
# %%
# Now, we have the data as desired.
@@ -337,7 +336,7 @@ def separate_source_target(sequence_pairs):
# As discussed earlier while building vocabulary, we need to pad shorter sentences in a batch to
# make all the sequences in a batch the same length. We can perform padding as follows:
-def apply_padding(pair_of_sequences):
+def applyPadding(pair_of_sequences):
"""
Convert sequnces to tensors and apply padding
"""
@@ -345,16 +344,16 @@ def apply_padding(pair_of_sequences):
## `T.ToTensor(0)` returns a transform that converts the sequence to `torch.tensor` and also applies
# padding. Here, `0` is passed to the constructor to specify the index of the `<pad>` token in the
# vocabulary.
-dataPipe = dataPipe.map(apply_padding)
+data_pipe = data_pipe.map(applyPadding)
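# %%
# To see what `T.ToTensor(0)` does in isolation, here is a small self-contained check; the index
# sequences below are made up purely for illustration. The shorter sequence should be padded with
# the value `0` so that both rows of the resulting tensor have the same length:

print(T.ToTensor(0)([[9, 10, 11], [12, 13]]))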
# %%
# Now, we can use the index to string mapping to see how the sequence would look with tokens
# instead of indices:
-sourceItoS = sourceVocab.get_itos()
-targetItoS = targetVocab.get_itos()
+source_index_to_string = source_vocab.get_itos()
+target_index_to_string = target_vocab.get_itos()
-def show_some_transformed_sentences(data_pipe):
+def showSomeTransformedSentences(data_pipe):
"""
Function to show how the senetnces look like after applying all transforms.
Here we try to print actual words instead of corresponding index
@@ -365,19 +364,18 @@ def show_some_transformed_sentences(data_pipe):
for i in range(4):
source = ""
for token in sources[i]:
- source += " " + sourceItoS[token]
+ source += " " + source_index_to_string[token]
target = ""
for token in targets[i]:
- target += " " + targetItoS[token]
+ target += " " + target_index_to_string[token]
print(f"Source: {source}")
print(f"Traget: {target}")
break
-show_some_transformed_sentences(dataPipe)
+showSomeTransformedSentences(data_pipe)
# %%
+# In the above output we can observe that the shorter sentences are padded with `<pad>`. Now, we
-# can use this dataPipe while writing our training function.
+# can use `data_pipe` while writing our training function.
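# %%
# As a rough sketch of how `data_pipe` could be consumed later, a training loop would simply
# iterate over the batches of padded tensors. Below we only print the shapes of the first batch;
# the actual model, loss, and optimizer are placeholders left out of this tutorial:

for sources, targets in data_pipe:
    print("Source batch shape:", sources.shape)
    print("Target batch shape:", targets.shape)
    # A real training step would run the model on `sources`, compare the output with `targets`,
    # compute a loss, and update the parameters here.
    break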
#
-# Some parts of this tutorial was inspired from this article:
-# Link: `https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71\
+# Some parts of this tutorial were inspired by `this article
# <https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71>`__.
From 9f71e3fa81569c001c2284600e8295c2cfdd9848 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Tue, 16 May 2023 22:45:13 +0530
Subject: [PATCH 29/31] corrected spelling mistakes
---
.../torchtext_custom_dataset_tutorial.py | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 1439615050b..05d3c0b8625 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -1,12 +1,12 @@
# -*- coding: utf-8 -*-
"""
-Pre-process custom text dataset using Torchtext
+Preprocess custom text dataset using Torchtext
===============================================
**Author**: `Anupam Sharma `_
This tutorial illustrates the usage of torchtext on a dataset that is not built-in. In the tutorial,
-we will pre-process a dataset that can be further utilized to train a sequence-to-sequence
+we will preprocess a dataset that can be further utilized to train a sequence-to-sequence
model for machine translation (something like, in this tutorial: `Sequence to Sequence Learning
with Neural Networks `_) but without using a legacy version
@@ -46,7 +46,7 @@
# convert a sentence to a list of words. Spacy is a Python package used for various Natural
# Language Processing (NLP) tasks.
#
-# Download the English and German models from spacy as shown below: ::
+# Download the English and German models from Spacy as shown below: ::
#
# python -m spacy download en_core_web_sm
# python -m spacy download de_core_news_sm
@@ -179,7 +179,7 @@ def getTokens(data_iter, place):
# * At line 4, we specify some special tokens:
#
# * `<sos>` for start of sentence
-# * `<eos>` for end of senetence
+# * `<eos>` for end of sentence
# * `<unk>` for unknown words. An example of an unknown word is the one skipped because of
# `min_freq=2`.
# * `<pad>` is the padding token. While training a model, we mostly train in batches. In a
@@ -189,7 +189,7 @@ def getTokens(data_iter, place):
# * At line 5, we set `special_first=True`. This means `<pad>` will get index 0, `<sos>` index 1,
#   `<eos>` index 2, and `<unk>` will get index 3 in the vocabulary.
# * At line 7, we set the default index as the index of `<unk>`. That means if some word is not in
-#   vocbulary, we will use `<unk>` instead of that unknown word.
+#   vocabulary, we will use `<unk>` instead of that unknown word.
#
# Similarly, we will build vocabulary for target sentences:
@@ -338,7 +338,7 @@ def separateSourceTarget(sequence_pairs):
def applyPadding(pair_of_sequences):
"""
- Convert sequnces to tensors and apply padding
+ Convert sequences to tensors and apply padding
"""
return (T.ToTensor(0)(list(pair_of_sequences[0])), T.ToTensor(0)(list(pair_of_sequences[1])))
## `T.ToTensor(0)` returns a transform that converts the sequence to `torch.tensor` and also applies
@@ -355,7 +355,7 @@ def applyPadding(pair_of_sequences):
def showSomeTransformedSentences(data_pipe):
"""
- Function to show how the senetnces look like after applying all transforms.
+    Function to show what the sentences look like after applying all transforms.
Here we try to print actual words instead of corresponding index
"""
for sources,targets in data_pipe:
From 12b644072d9f720123ae27639f8eb9adc3cbbee5 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Wed, 17 May 2023 00:57:40 +0530
Subject: [PATCH 30/31] followed pyspelling's configuration for the shell
commands
---
.../torchtext_custom_dataset_tutorial.py | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 05d3c0b8625..9875d8aa43a 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -46,10 +46,13 @@
# convert a sentence to a list of words. Spacy is a Python package used for various Natural
# Language Processing (NLP) tasks.
#
-# Download the English and German models from Spacy as shown below: ::
+# Download the English and German models from Spacy as shown below:
+#
+# .. code-block:: shell
+#
+# python -m spacy download en_core_web_sm
+# python -m spacy download de_core_news_sm
#
-# python -m spacy download en_core_web_sm
-# python -m spacy download de_core_news_sm
# %%
@@ -59,8 +62,8 @@
import torchtext.transforms as T
import spacy
from torchtext.vocab import build_vocab_from_iterator
-eng = spacy.load("en_core_web_sm") # Load the English model to be used for tokenizing
-de = spacy.load("de_core_news_sm") # Load the German model to be used for tokenizing
+eng = spacy.load("en_core_web_sm") # Load the English model to tokenize English text
+de = spacy.load("de_core_news_sm") # Load the German model to tokenize German text
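# %%
# As a quick, optional check (not part of this patch), the loaded pipelines can be used to
# tokenize a sentence; the sample sentences below are made up for illustration:

print([token.text for token in eng.tokenizer("Have a good day!")])
print([token.text for token in de.tokenizer("Einen schönen Tag noch!")])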
# %%
# Now we will load the dataset
From f48bc259ccc54adc7413ec985066606a62cb66e5 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Wed, 17 May 2023 00:59:24 +0530
Subject: [PATCH 31/31] added words used in
beginner_source/torchtext_custom_dataset_tutorial.py
---
en-wordlist.txt | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/en-wordlist.txt b/en-wordlist.txt
index fdf5df67d8d..20b63dadc2d 100644
--- a/en-wordlist.txt
+++ b/en-wordlist.txt
@@ -30,6 +30,8 @@ DDP
DDQN
DNN
DQN
+DataPipe
+DataPipes
DataLoaders
DeepMind
DeiT
@@ -126,6 +128,7 @@ SciPy
Sequentials
Sigmoid
SoTA
+Spacy
TPU
TensorBoard
TextVQA
@@ -345,6 +348,7 @@ timestep
timesteps
tokenization
tokenize
+tokenized
tokenizer
tokenizes
tooltip