From f24a95983110a25d1c4237e95b90907ff0ea3589 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:16:55 +0530
Subject: [PATCH 01/31] added intro
---
.../torchtext_custom_dataset_tutorial.py | 27 +++++++++++++++++++
1 file changed, 27 insertions(+)
create mode 100644 beginner_source/torchtext_custom_dataset_tutorial.py
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
new file mode 100644
index 00000000000..daa6f39a28c
--- /dev/null
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+"""
+Preaparing custom text dataset using Torchtext
+==============================================
+
+**Author**: `Anupam Sharma `_
+
+This tutorial is regarding the preparation of a text dataset using Torchtext. In the tutorial, we
+will be preparing a custom dataset that can be further utilized to train a sequence-to-sequence
+model for machine translation (something like, in this tutorial: `Sequence to Sequence Learning
+with Neural Networks `_) but using Torchtext 0.15.0 instead
+of a legacy version.
+
+In this tutorial, we will learn how to:
+
+* Read a dataset
+* Tokenize sentence
+* Apply transforms to sentence
+* Perform bucket batching
+
+Let us assume that we need to prepare a dataset to train a model that can perform English to
+Finnish translation. We will use a tab-delimited Finnish - English sentence pairs provided by
+the `Tatoeba Project `_ which can be downloaded from this link: `Click
+Here `__
+"""
+
From a0ac5d5a93fd3b149dd1fe3aecebf4681d5ee7e5 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:18:10 +0530
Subject: [PATCH 02/31] added setup section
---
.../torchtext_custom_dataset_tutorial.py | 24 +++++++++++++++++++
1 file changed, 24 insertions(+)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index daa6f39a28c..62b7735ba47 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -25,3 +25,27 @@
Here `__
"""
+# %%
+# Setup
+# -----
+#
+# First, download the dataset, extract the zip, and note the path to the file `fin.txt`.
+# The dataset can be downloaded from this link: `Click Here `__ .
+#
+# Ensure that following packages are installed:
+#
+# * `Torchdata 0.6.0 `_ (Installation instructions: `C\
+# lick here `__)
+# * `Torchtext 0.15.0 `_ (Installation instructions:\
+# `Click here `__)
+# * Spacy (Docs: `Click here `__)
+#
+# Here, we are using `Spacy` to tokenize text. In simple words tokenization means to
+# convert a sentence to list of words. Spacy is a python package used for various Natural
+# Language Processing (NLP) tasks.
+#
+# Download the English and Finnish models from spacy as shown below: ::
+#
+# python -m spacy download en_core_web_sm
+# python -m spacy download fi_core_news_sm
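
Editorial aside: the same models can also be fetched from inside Python with spaCy's CLI helper. A minimal sketch, not part of the patch; it assumes spaCy is already installed and network access is available::

    import spacy.cli

    spacy.cli.download("en_core_web_sm")   # English tokenizer model
    spacy.cli.download("fi_core_news_sm")  # Finnish tokenizer model
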
From d1a010b74f56928a8853daa86b3bae3c44f91da0 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:20:19 +0530
Subject: [PATCH 03/31] import packages and read dataset
---
.../torchtext_custom_dataset_tutorial.py | 40 +++++++++++++++++++
1 file changed, 40 insertions(+)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 62b7735ba47..14f88da3511 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -49,3 +49,43 @@
#
# python -m spacy download en_core_web_sm
# python -m spacy download fi_core_news_sm
+
+
+# %%
+# Let us start by importing required modules:
+
+import torchdata.datapipes as dp
+import torchtext.transforms as T
+import spacy
+from torchtext.vocab import build_vocab_from_iterator
+eng = spacy.load("en_core_web_sm") # Load the English model to be used for tokenizing
+fin = spacy.load("fi_core_news_sm") # Load the Finnish model to be used for tokenizing
+
+# %%
+# Now we will load the dataset
+
+FILE_PATH = 'fin.txt'
+dataPipe = dp.iter.IterableWrapper([FILE_PATH])
+dataPipe = dp.iter.FileOpener(dataPipe, mode='rb')
+dataPipe = dataPipe.parse_csv(skip_lines=0, delimiter='\t', as_tuple=True)
+
+# %%
+# In the above code block, we are doing following things:
+#
+# 1. At line 2, we are creating an iterable of filenames
+# 2. At line 3, we pass the iterable to `FileOpener` which then
+# opens the file in read mode
+# 3. At line 4, we call a function to parse the file, which
+# again returns an iterable of tuples representing each rows
+# of the tab-delimited file
+#
+# Data pipes can be thought of something like a dataset object, on which
+# we can perform various operations. Check `this tutorial `_ for more details on data pipes.
+#
+# We can verify if the iterable has the pair of sentences as shown
+# below:
+
+for sample in dataPipe:
+ print(sample)
+ break
From b8d1dfa27fc63f00ac03007c681b258b3956a891 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:21:36 +0530
Subject: [PATCH 04/31] remove attributions from dataset
---
.../torchtext_custom_dataset_tutorial.py | 22 +++++++++++++++++++
1 file changed, 22 insertions(+)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 14f88da3511..9cf9441c640 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -89,3 +89,25 @@
for sample in dataPipe:
print(sample)
break
+
+# %%
+# Note that we also have attribution details along with pair of sentences. We will
+# write a small function to remove the attribution details:
+
+def remove_attribution(row):
+ """
+ Function to keep the first two elements in a tuple
+ """
+ return row[:2]
+dataPipe = dataPipe.map(remove_attribution)
+
+# %%
+# The `map` function at line 2 in above code block can be used to apply some function
+# on each elements of data pipe. Now, we can verify that the data pipe only contains
+# pair of sentences.
+
+
+for sample in dataPipe:
+ print(sample)
+ break
+
From da7d346bcc238b4b0389d30eb0a3272253a53f4f Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:22:46 +0530
Subject: [PATCH 05/31] added functions for tokenization
---
.../torchtext_custom_dataset_tutorial.py | 22 +++++++++++++++++++
1 file changed, 22 insertions(+)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 9cf9441c640..33da9be38ad 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -111,3 +111,25 @@ def remove_attribution(row):
print(sample)
break
+# %%
+# Now, let us define few functions to perform tokenization:
+
+def eng_tokenize(text):
+ """
+ Tokenize an English text and returns list of tokens
+ """
+ return [token.text for token in eng.tokenizer(text)]
+
+def fin_tokenize(text):
+ """
+ Tokenize a Finnish text and returns list of tokens
+ """
+ return [token.text for token in fin.tokenizer(text)]
+
+# %%
+# Above function accepts a text and returns a list of words
+# as shown below:
+
+print(eng_tokenize("Have a good day!!!"))
+print(fin_tokenize("Hyvää päivänjatkoa!!!"))
+
From 86f56e10f9b8b5949b3a78eb3009270a3e786015 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:23:48 +0530
Subject: [PATCH 06/31] building the vocabulary
---
.../torchtext_custom_dataset_tutorial.py | 66 +++++++++++++++++++
1 file changed, 66 insertions(+)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 33da9be38ad..5aa6f8d8bed 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -133,3 +133,69 @@ def fin_tokenize(text):
print(eng_tokenize("Have a good day!!!"))
print(fin_tokenize("Hyvää päivänjatkoa!!!"))
+# %%
+# Building the vocabulary
+# -----------------------
+# Let us consider an English sentence as the source and a Finnish sentence as the target.
+#
+# Vocabulary can be considered as the set of unique words we have in the dataset.
+# We will build vocabulary for both our source and target now.
+#
+# Let us define a function to get tokens from elements of tuples in the iterator.
+# The comments within the function specifies the need and working of it:
+
+def get_tokens(data_iter, place):
+ """
+ Function to yield tokens from an iterator. Since, our iterator contains
+ tuple of sentences (source and target), `place` parameters defines for which
+ index to return the tokens for. `place=0` for source and `place=1` for target
+ """
+ for english, finnish in data_iter:
+ if place == 0:
+ yield eng_tokenize(english)
+ else:
+ yield fin_tokenize(finnish)
+
+# %%
+# Now, we will build vocabulary for source:
+
+sourceVocab = build_vocab_from_iterator(
+ get_tokens(dataPipe,0),
+ min_freq=2,
+    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
+ special_first=True
+)
+sourceVocab.set_default_index(sourceVocab['<unk>'])
+
+# %%
+# The code above, builds the vocabulary from the iterator. In the above code block:
+#
+# * At line 2, we call the `get_tokens()` function with `place=0` as we need vocabulary for
+# source sentences.
+# * At line 3, we set `min_freq=2`. This means, the function will skip those words that occurs
+# less than 2 times.
+# * At line 4, we specify some special tokens:
+#
+# * `<sos>` for start of sentence
+# * `<eos>` for end of sentence
+# * `<unk>` for unknown words. An example of unknown word is the one skipped because of
+# `min_freq=2`.
+# * `<pad>` is the padding token. While training a model, we mostly train in batches. In a
+# batch, there can be sentences of different length. So, we pad the shorter sentences with
+# `<pad>` token to make length of all sequences in the batch equal.
+#
+# * At line 5, we set `special_first=True`. Which means `<pad>` will get index 0, `<sos>` index 1,
+# `<eos>` index 2, and `<unk>` will get index 3 in the vocabulary.
+# * At line 7, we set default index as index of `<unk>`. That means if some word is not in
+# vocabulary, we will use `<unk>` instead of that unknown word.
+#
+# Similarly, we will build vocabulary for target sentences:
+
+targetVocab = build_vocab_from_iterator(
+ get_tokens(dataPipe,1),
+ min_freq=2,
+    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
+ special_first=True
+)
+targetVocab.set_default_index(targetVocab['<unk>'])
+
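
Editorial aside: to make the effect of `min_freq` and the default index concrete, the built vocabulary can be queried directly. A small sketch, not part of the patch; it relies on torchtext's `Vocab` lookup methods, and the exact indices depend on the dataset::

    # tokens -> indices; words seen fewer than 2 times fall back to '<unk>'
    print(sourceVocab(['have', 'a', 'good', 'day']))
    print(sourceVocab['<unk>'])                      # 3, because of special_first=True
    print(sourceVocab.lookup_tokens([0, 1, 2, 3]))   # ['<pad>', '<sos>', '<eos>', '<unk>']
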
From 10e01a819b47f9dd93a5c6bfa27a9442a230776a Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:25:04 +0530
Subject: [PATCH 07/31] added some comments
---
beginner_source/torchtext_custom_dataset_tutorial.py | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 5aa6f8d8bed..004481403d9 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -199,3 +199,13 @@ def get_tokens(data_iter, place):
)
targetVocab.set_default_index(targetVocab['<unk>'])
+# %%
+# Note that the example above shows how can we add special tokens to our vocabulary. The
+# special tokens may change based on the requirements.
+#
+# Now, we can verify that special tokens are placed at the beginning and then other words.
+# In the below code, `sourceVocab.get_itos()` returns a list with tokens at index based on
+# vocabulary.
+
+print(sourceVocab.get_itos()[:9])
+
From 7dfa29f3db08949e2dd9400e3310fe342b38f29d Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:25:58 +0530
Subject: [PATCH 08/31] Numericalize sentences using vocabulary
---
.../torchtext_custom_dataset_tutorial.py | 64 +++++++++++++++++++
1 file changed, 64 insertions(+)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 004481403d9..beab7ee8641 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -209,3 +209,67 @@ def get_tokens(data_iter, place):
print(sourceVocab.get_itos()[:9])
+# %%
+# Numericalize sentences using vocabulary
+# ---------------------------------------
+# After building the vocabulary, we need to convert our sentences to corresponding indices.
+# Let us define some functions for this:
+
+def get_transform(vocab):
+ """
+ Create transforms based on given vocabulary. The returned transform is applied to sequence
+ of tokens.
+ """
+    text_transform = T.Sequential(
+ ## converts the sentences to indices based on given vocabulary
+ T.VocabTransform(vocab=vocab),
+        ## Add <sos> at beginning of each sentence. 1 because the index for <sos> in vocabulary is
+        # 1 as seen in previous section
+        T.AddToken(1, begin=True),
+        ## Add <eos> at end of each sentence. 2 because the index for <eos> in vocabulary is
+        # 2 as seen in previous section
+        T.AddToken(2, begin=False)
+ )
+    return text_transform
+
+# %%
+# Now, let us see how to use the above function. The function returns an object of `Transforms`
+# which we will use on our sentence. Let us take a random sentence and check the working of
+# the transform:
+
+tempList = list(dataPipe)
+someSetence = tempList[798][0]
+print("Some sentence=", end="")
+print(someSetence)
+transformedSentence = get_transform(sourceVocab)(eng_tokenize(someSetence))
+print("Transformed sentence=", end="")
+print(transformedSentence)
+indexToString = sourceVocab.get_itos()
+for index in transformedSentence:
+ print(indexToString[index], end=" ")
+
+# %%
+# In the above code:
+#
+# * At line 2, we take a source setence from list that we created from dataPipe at line 1
+# * At line 5, we get a transform based on a source vocabulary and apply it to a tokenized
+# sentence. Note that transforms take list of words and not a sentence.
+# * At line 8, we get the mapping of index to string and then use it get the transformed
+# sentence
+#
+# Now we will use functions of `dataPipe` to apply transform to all our sentences.
+# Let us define some more functions for this.
+
+def apply_transform(sequence_pair):
+ """
+ Apply transforms to sequence of tokens in a sequence pair
+ """
+
+ return (
+ get_transform(sourceVocab)(eng_tokenize(sequence_pair[0])),
+ get_transform(targetVocab)(fin_tokenize(sequence_pair[1]))
+ )
+dataPipe = dataPipe.map(apply_transform) ## Apply the function to each element in the iterator
+tempList = list(dataPipe)
+print(tempList[0])
+
From d9dea745c6261c3a87d2ab3cb6e1795f68a73ac7 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:27:06 +0530
Subject: [PATCH 09/31] bucket batching
---
.../torchtext_custom_dataset_tutorial.py | 53 +++++++++++++++++++
1 file changed, 53 insertions(+)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index beab7ee8641..1e7314d1013 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -273,3 +273,56 @@ def apply_transform(sequence_pair):
tempList = list(dataPipe)
print(tempList[0])
+# %%
+# Make batches (with bucket batch)
+# --------------------------------
+# Generally, we train models in batches. While working with sequence to sequence models, it is
+# recommended to keep the length of sequences in a batch similar. For that we will use
+# `bucketbatch` function of `dataPipe`.
+#
+# Let us define some functions that will be used by the `bucketbatch` function.
+
+def sort_bucket(bucket):
+ """
+ Function to sort a given bucket. Here, we want to sort based on the length of
+ source and target sequence.
+ """
+ return sorted(bucket, key=lambda x: (len(x[0]), len(x[1])))
+
+# %%
+# Now, we will apply the `bucketbatch` function:
+
+dataPipe = dataPipe.bucketbatch(
+ batch_size = 4, batch_num=5, bucket_num=1,
+ use_in_batch_shuffle=False, sort_key=sort_bucket
+)
+
+# %%
+# In the above code block:
+#
+# * We keep batch size = 4.
+# * `batch_num` is the number of batches to keep in a bucket
+# * `bucket_num` is the number of buckets to keep in a pool for shuffling
+# * `sort_key` specifies the function that takes a bucket and sorts it
+#
+# Now, let us consider a batch of source sentences as `X` and a batch of target sentences as `y`.
+# Generally, while training a model, we predict on a batch of `X` and compare the result with `y`.
+# But, a batch in our `dataPipe` is of the form `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]`:
+
+print(list(dataPipe)[0])
+# %%
+# So, we will now convert them into the form: `((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))`.
+# For this we will write a small function:
+
+def separate_source_target(sequence_pairs):
+ """
+ input of form: `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]`
+ output of form: `((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))`
+ """
+ sources,targets = zip(*sequence_pairs)
+ return sources,targets
+
+## Apply the function to each element in the iterator
+dataPipe = dataPipe.map(separate_source_target)
+print(list(dataPipe)[0])
+
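
Editorial aside: the effect of `sort_bucket` can be seen on a hand-made toy bucket, not part of the patch; plain Python lists stand in for index sequences::

    toy_bucket = [
        ([1, 5, 6, 2], [1, 7, 2]),   # source length 4
        ([1, 2], [1, 2]),            # source length 2
        ([1, 9, 2], [1, 8, 8, 2]),   # source length 3
    ]
    print(sort_bucket(toy_bucket))
    # [([1, 2], [1, 2]), ([1, 9, 2], [1, 8, 8, 2]), ([1, 5, 6, 2], [1, 7, 2])]
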
From 2dd7f197fd9cdc5e30b6cee3be0250d8cf284da9 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:27:57 +0530
Subject: [PATCH 10/31] applied padding
---
.../torchtext_custom_dataset_tutorial.py | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 1e7314d1013..a6d42ea999f 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -326,3 +326,21 @@ def separate_source_target(sequence_pairs):
dataPipe = dataPipe.map(separate_source_target)
print(list(dataPipe)[0])
+# %%
+# Now, we have the data as desired.
+#
+# Padding
+# -------
+# As discussed earlier while building vocabulary, we need to pad shorter sentences in a batch to
+# make all the sequences in a batch of equal length. We can perform padding as follows:
+
+def apply_padding(pair_of_sequences):
+ """
+    Convert sequences to tensors and apply padding
+ """
+ return (T.ToTensor(0)(list(pair_of_sequences[0])), T.ToTensor(0)(list(pair_of_sequences[1])))
+## `T.ToTensor(0)` returns a transform that converts the sequence to `torch.tensor` and also applies
+# padding. Here, `0` is passed to the constructor to specify the index of the `<pad>` token in the
+# vocabulary.
+dataPipe = dataPipe.map(apply_padding)
+
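
Editorial aside: what `T.ToTensor(0)` does can be checked on two toy sequences of unequal length (a sketch, not part of the patch); the shorter one is padded with `0`, the index of `<pad>`::

    print(T.ToTensor(0)([[1, 5, 2], [1, 2]]))
    # tensor([[1, 5, 2],
    #         [1, 2, 0]])
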
From 1fec4b5aa0cc4b6565417420e76e6c7c02e05735 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:31:18 +0530
Subject: [PATCH 11/31] view the final result
---
.../torchtext_custom_dataset_tutorial.py | 28 +++++++++++++++++++
1 file changed, 28 insertions(+)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index a6d42ea999f..9e0a3ed1c6e 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -344,3 +344,31 @@ def apply_padding(pair_of_sequences):
# vocabulary.
dataPipe = dataPipe.map(apply_padding)
+# %%
+# Now, we can use the index to string mapping to see how the sequence would look with tokens
+# instead of indices:
+
+sourceItoS = sourceVocab.get_itos()
+targetItoS = targetVocab.get_itos()
+
+def show_some_transformed_senetnces(data_pipe):
+ """
+    Function to show how the sentences look after applying all transforms.
+ Here we try to print actual words instead of corresponding index
+ """
+ for sources,targets in data_pipe:
+ if sources[0][-1] != 0:
+ continue # Just to visualize padding of shorter sentences
+ for i in range(4):
+ source = ""
+ for token in sources[i]:
+ source += " " + sourceItoS[token]
+ target = ""
+ for token in targets[i]:
+ target += " " + targetItoS[token]
+ print(f"Source: {source}")
+            print(f"Target: {target}")
+ break
+# %%
+# In the above output we can observe that the shorter sentences are padded with `<pad>`. Now, we can
+# use this dataPipe while writing our training function.
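
Editorial aside: a minimal sketch (an assumption, not shown in the patch) of how the finished pipe could be consumed, for example by wrapping it in a standard `DataLoader`; `batch_size=None` is used because batching is already done by `bucketbatch`::

    from torch.utils.data import DataLoader

    loader = DataLoader(dataPipe, batch_size=None)
    for sources, targets in loader:
        # sources and targets are padded integer tensors of shape (4, max_len)
        print(sources.shape, targets.shape)
        break
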
From 158c5fd2ad40028dea67a48152ffbe6927729c16 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:39:16 +0530
Subject: [PATCH 12/31] added torchtext logo
---
.../img/thumbnails/cropped/torch_text_logo.png | Bin 0 -> 8545 bytes
1 file changed, 0 insertions(+), 0 deletions(-)
create mode 100644 _static/img/thumbnails/cropped/torch_text_logo.png
diff --git a/_static/img/thumbnails/cropped/torch_text_logo.png b/_static/img/thumbnails/cropped/torch_text_logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..3fe736d60e282d080ac3165e3a5aad3c69bcc934
GIT binary patch
literal 8545
(base85-encoded binary image data omitted)
Date: Thu, 4 May 2023 19:45:19 +0530
Subject: [PATCH 13/31] added card in index.rst
---
index.rst | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/index.rst b/index.rst
index 48550e6c4ac..ecb64a1aa6e 100644
--- a/index.rst
+++ b/index.rst
@@ -264,6 +264,13 @@ What's new in PyTorch tutorials?
:link: beginner/translation_transformer.html
:tags: Text
+.. customcarditem::
+ :header: Preaparing custom text dataset using Torchtext
+ :card_description: Learn how to use torchtext to prepare a custom dataset
+ :image: _static/img/thumbnails/cropped/torch_text_logo.png
+ :link: beginner/torchtext_custom_dataset_tutorial.html
+ :tags: Text
+
.. Reinforcement Learning
From 09dd73b4ae63bb4b3f77942b5453476fb5f50dc7 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 19:50:05 +0530
Subject: [PATCH 14/31] added entry in toctree
---
index.rst | 1 +
1 file changed, 1 insertion(+)
diff --git a/index.rst b/index.rst
index ecb64a1aa6e..d23ec3a701a 100644
--- a/index.rst
+++ b/index.rst
@@ -877,6 +877,7 @@ Additional Resources
intermediate/seq2seq_translation_tutorial
beginner/text_sentiment_ngrams_tutorial
beginner/translation_transformer
+ beginner/torchtext_custom_dataset_tutorial
.. toctree::
From f7619d73a870b7a25c2a63230a0477eb1978eff1 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 20:06:01 +0530
Subject: [PATCH 15/31] updated Makefile for downloading dataset
---
Makefile | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/Makefile b/Makefile
index a01ea69bb50..fe5cd32e31e 100644
--- a/Makefile
+++ b/Makefile
@@ -106,6 +106,10 @@ download:
wget -nv -N http://dl.fbaipublicfiles.com/pythia/data/vocab.tar.gz -P $(DATADIR)
tar $(TAROPTS) -xzf $(DATADIR)/vocab.tar.gz -C ./beginner_source/data/
+ # Download dataset for beginner_source/torchtext_custom_dataset_tutorial.py
+ wget -nv -N https://www.manythings.org/anki/fin-eng.zip -P $(DATADIR)
+ unzip -o $(DATADIR)/fin-eng.zip -d beginner_source/data/
+
docs:
make download
From 1e4163d91a906483212162d6fdc8e46c8da2d637 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 20:07:27 +0530
Subject: [PATCH 16/31] get dataset from data folder
---
beginner_source/torchtext_custom_dataset_tutorial.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 9e0a3ed1c6e..9281998054e 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -64,7 +64,7 @@
# %%
# Now we will load the dataset
-FILE_PATH = 'fin.txt'
+FILE_PATH = 'data/fin.txt'
dataPipe = dp.iter.IterableWrapper([FILE_PATH])
dataPipe = dp.iter.FileOpener(dataPipe, mode='rb')
dataPipe = dataPipe.parse_csv(skip_lines=0, delimiter='\t', as_tuple=True)
From 1d96ae31d27a068a4720ed2abc1bbc3613c46c3a Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 22:18:09 +0530
Subject: [PATCH 17/31] updated comment
---
beginner_source/torchtext_custom_dataset_tutorial.py | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 9281998054e..b9ae85fde93 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -9,8 +9,7 @@
will be preparing a custom dataset that can be further utilized to train a sequence-to-sequence
model for machine translation (something like, in this tutorial: `Sequence to Sequence Learning
with Neural Networks `_) but using Torchtext 0.15.0 instead
-of a legacy version.
+20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb>`_) but without using legacy version.
In this tutorial, we will learn how to:
From af728f1893cb7d2bf0ba426b7a9f62922d9d5ced Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 22:57:11 +0530
Subject: [PATCH 18/31] updated function to view results, and added some
sources
---
beginner_source/torchtext_custom_dataset_tutorial.py | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index b9ae85fde93..6a4b6991e3b 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -350,7 +350,7 @@ def apply_padding(pair_of_sequences):
sourceItoS = sourceVocab.get_itos()
targetItoS = targetVocab.get_itos()
-def show_some_transformed_senetnces(data_pipe):
+def show_some_transformed_sentences(data_pipe):
"""
    Function to show how the sentences look after applying all transforms.
Here we try to print actual words instead of corresponding index
@@ -368,6 +368,11 @@ def show_some_transformed_senetnces(data_pipe):
print(f"Source: {source}")
print(f"Traget: {target}")
break
+
+show_some_transformed_sentences(dataPipe)
# %%
-# In the above output we can observe that the shorter sentences are padded with `<pad>`. Now, we can
-# use this dataPipe while writing our training function.
+# In the above output we can observe that the shorter sentences are padded with `<pad>`. Now, we\
+# can use this dataPipe while writing our training function.
+#
+# Some parts of this tutorial was inspired from this article: `Click here: `__.
From 519ccbfe51723b2e64c57a123bae265154c66571 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Thu, 4 May 2023 23:09:42 +0530
Subject: [PATCH 19/31] updated typo
---
beginner_source/torchtext_custom_dataset_tutorial.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 6a4b6991e3b..aeb1ef86de4 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -374,5 +374,5 @@ def show_some_transformed_sentences(data_pipe):
# In the above output we can observe that the shorter sentences are padded with `<pad>`. Now, we\
# can use this dataPipe while writing our training function.
#
-# Some parts of this tutorial was inspired from this article: `Click here: `__.
From 332356be56c643a7ef1821f10e3fa5ef9e09826e Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Fri, 5 May 2023 01:11:53 +0530
Subject: [PATCH 20/31] fixed hyperlinks
---
.../torchtext_custom_dataset_tutorial.py | 27 +++++++++++--------
1 file changed, 16 insertions(+), 11 deletions(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index aeb1ef86de4..d22a41eb303 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -9,7 +9,8 @@
will be preparing a custom dataset that can be further utilized to train a sequence-to-sequence
model for machine translation (something like, in this tutorial: `Sequence to Sequence Learning
with Neural Networks `_) but without using legacy version.
+20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb>`_) but without using legacy version
+of torchtext.
In this tutorial, we will learn how to:
@@ -21,16 +22,18 @@
Let us assume that we need to prepare a dataset to train a model that can perform English to
Finnish translation. We will use a tab-delimited Finnish - English sentence pairs provided by
the `Tatoeba Project `_ which can be downloaded from this link: `Click
-Here `__
+Here `__.
+
+Sentence pairs for other languages can be found in this link:
+
+Link: `https://www.manythings.org/anki/ `__
"""
# %%
-# Setup
-# -----
+#Setup
+#-----
#
# First, download the dataset, extract the zip, and note the path to the file `fin.txt`.
-# The dataset can be downloaded from this link: `Click Here `__ .
#
# Ensure that following packages are installed:
#
@@ -79,8 +82,9 @@
# of the tab-delimited file
#
# Data pipes can be thought of something like a dataset object, on which
-# we can perform various operations. Check `this tutorial `_ for more details on data pipes.
+# we can perform various operations.
+# Check `this tutorial `_ for more details on
+# data pipes.
#
# We can verify if the iterable has the pair of sentences as shown
# below:
@@ -371,8 +375,9 @@ def show_some_transformed_sentences(data_pipe):
show_some_transformed_sentences(dataPipe)
# %%
-# In the above output we can observe that the shorter sentences are padded with `<pad>`. Now, we\
+# In the above output we can observe that the shorter sentences are padded with `<pad>`. Now, we
# can use this dataPipe while writing our training function.
#
-# Some parts of this tutorial was inspired from this article: `Click here `__.
+# Some parts of this tutorial was inspired from this article:
+# `Link https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71 \
+# `__.
From 44848ff67d565f4ab5869ea30ea0e656b257e05f Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Fri, 5 May 2023 01:22:00 +0530
Subject: [PATCH 21/31] changed title and introduction
---
beginner_source/torchtext_custom_dataset_tutorial.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index d22a41eb303..f4e5350cbd9 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -1,12 +1,12 @@
# -*- coding: utf-8 -*-
"""
-Preaparing custom text dataset using Torchtext
+Pre-process custom text dataset using Torchtext
==============================================
**Author**: `Anupam Sharma `_
-This tutorial is regarding the preparation of a text dataset using Torchtext. In the tutorial, we
-will be preparing a custom dataset that can be further utilized to train a sequence-to-sequence
+This tutorial illustrates the usage of torchtext on a dataset that is not built-in. In the tutorial,
+we will pre-process a dataset that can be further utilized to train a sequence-to-sequence
model for machine translation (something like, in this tutorial: `Sequence to Sequence Learning
with Neural Networks `_) but without using legacy version
From e007c60402c0e91dd1014b2bdb7e30bd027d493f Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Fri, 5 May 2023 01:46:14 +0530
Subject: [PATCH 22/31] fixed indentation issue
---
beginner_source/torchtext_custom_dataset_tutorial.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index f4e5350cbd9..e66333d8a15 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -30,8 +30,8 @@
"""
# %%
-#Setup
-#-----
+# Setup
+# -----
#
# First, download the dataset, extract the zip, and note the path to the file `fin.txt`.
#
From 66dd97295781d5f911430f886258147cda3754f4 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Fri, 5 May 2023 02:11:28 +0530
Subject: [PATCH 23/31] fixed typo
---
.../torchtext_custom_dataset_tutorial.py | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index e66333d8a15..20bea700745 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -241,10 +241,10 @@ def get_transform(vocab):
# the transform:
tempList = list(dataPipe)
-someSetence = tempList[798][0]
+someSentence = tempList[798][0]
print("Some sentence=", end="")
-print(someSetence)
-transformedSentence = get_transform(sourceVocab)(eng_tokenize(someSetence))
+print(someSentence)
+transformedSentence = get_transform(sourceVocab)(eng_tokenize(someSentence))
print("Transformed sentence=", end="")
print(transformedSentence)
indexToString = sourceVocab.get_itos()
@@ -254,7 +254,7 @@ def get_transform(vocab):
# %%
# In the above code:
#
-# * At line 2, we take a source setence from list that we created from dataPipe at line 1
+# * At line 2, we take a source sentence from list that we created from dataPipe at line 1
# * At line 5, we get a transform based on a source vocabulary and apply it to a tokenized
# sentence. Note that transforms take list of words and not a sentence.
# * At line 8, we get the mapping of index to string and then use it get the transformed
@@ -378,6 +378,6 @@ def show_some_transformed_sentences(data_pipe):
# In the above output we can observe that the shorter sentences are padded with `<pad>`. Now, we
# can use this dataPipe while writing our training function.
#
-# Some parts of this tutorial was inspired from this article:
-# `Link https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71 \
-# `__.
+# Some parts of this tutorial was inspired from this article: Torchtext DataLoaders in version 0.14.0
+# `Link: https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71 \
+# `__.
From 190e4a1fadda8ff3d22922c8cc1bdf11f4642207 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Fri, 5 May 2023 02:27:45 +0530
Subject: [PATCH 24/31] fixed typo
---
beginner_source/torchtext_custom_dataset_tutorial.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 20bea700745..d6e6cdcf3d5 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -378,6 +378,6 @@ def show_some_transformed_sentences(data_pipe):
# In the above output we can observe that the shorter sentences are padded with `<pad>`. Now, we
# can use this dataPipe while writing our training function.
#
-# Some parts of this tutorial was inspired from this article: Torchtext DataLoaders in version 0.14.0
-# `Link: https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71 \
-# `__.
+# Some parts of this tutorial were inspired by this article:
+# Link: `https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71\
+# <https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71>`__.
From ba5efff2f15c235633f39ea86d06561944dcec1b Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Fri, 5 May 2023 03:26:57 +0530
Subject: [PATCH 25/31] replaced Finnish with German as spacy German model is
already there in build
---
.../torchtext_custom_dataset_tutorial.py | 30 +++++++++----------
1 file changed, 15 insertions(+), 15 deletions(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index d6e6cdcf3d5..ad749d9fefb 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -20,9 +20,9 @@
* Perform bucket batching
Let us assume that we need to prepare a dataset to train a model that can perform English to
-Finnish translation. We will use a tab-delimited Finnish - English sentence pairs provided by
+German translation. We will use a tab-delimited German - English sentence pairs provided by
the `Tatoeba Project `_ which can be downloaded from this link: `Click
-Here `__.
+Here `__.
Sentence pairs for other languages can be found in this link:
@@ -33,7 +33,7 @@
# Setup
# -----
#
-# First, download the dataset, extract the zip, and note the path to the file `fin.txt`.
+# First, download the dataset, extract the zip, and note the path to the file `deu.txt`.
#
# Ensure that following packages are installed:
#
@@ -47,10 +47,10 @@
# convert a sentence to list of words. Spacy is a python package used for various Natural
# Language Processing (NLP) tasks.
#
-# Download the English and Finnish models from spacy as shown below: ::
+# Download the English and German models from spacy as shown below: ::
#
# python -m spacy download en_core_web_sm
-# python -m spacy download fi_core_news_sm
+# python -m spacy download de_core_news_sm
# %%
@@ -61,12 +61,12 @@
import spacy
from torchtext.vocab import build_vocab_from_iterator
eng = spacy.load("en_core_web_sm") # Load the English model to be used for tokenizing
-fin = spacy.load("fi_core_news_sm") # Load the Finnish model to be used for tokenizing
+de = spacy.load("de_core_news_sm") # Load the German model to be used for tokenizing
# %%
# Now we will load the dataset
-FILE_PATH = 'data/fin.txt'
+FILE_PATH = 'data/deu.txt'
dataPipe = dp.iter.IterableWrapper([FILE_PATH])
dataPipe = dp.iter.FileOpener(dataPipe, mode='rb')
dataPipe = dataPipe.parse_csv(skip_lines=0, delimiter='\t', as_tuple=True)
@@ -123,23 +123,23 @@ def eng_tokenize(text):
"""
return [token.text for token in eng.tokenizer(text)]
-def fin_tokenize(text):
+def de_tokenize(text):
"""
- Tokenize a Finnish text and returns list of tokens
+ Tokenize a German text and returns list of tokens
"""
- return [token.text for token in fin.tokenizer(text)]
+ return [token.text for token in de.tokenizer(text)]
# %%
# Above function accepts a text and returns a list of words
# as shown below:
print(eng_tokenize("Have a good day!!!"))
-print(fin_tokenize("Hyvää päivänjatkoa!!!"))
+print(de_tokenize("Haben Sie einen guten Tag!!!"))
# %%
# Building the vocabulary
# -----------------------
-# Let us consider an English sentence as the source and a Finnish sentence as the target.
+# Let us consider an English sentence as the source and a German sentence as the target.
#
# Vocabulary can be considered as the set of unique words we have in the dataset.
# We will build vocabulary for both our source and target now.
@@ -153,11 +153,11 @@ def get_tokens(data_iter, place):
tuple of sentences (source and target), `place` parameters defines for which
index to return the tokens for. `place=0` for source and `place=1` for target
"""
- for english, finnish in data_iter:
+ for english, german in data_iter:
if place == 0:
yield eng_tokenize(english)
else:
- yield fin_tokenize(finnish)
+ yield de_tokenize(german)
# %%
# Now, we will build vocabulary for source:
@@ -270,7 +270,7 @@ def apply_transform(sequence_pair):
return (
get_transform(sourceVocab)(eng_tokenize(sequence_pair[0])),
- get_transform(targetVocab)(fin_tokenize(sequence_pair[1]))
+ get_transform(targetVocab)(de_tokenize(sequence_pair[1]))
)
dataPipe = dataPipe.map(apply_transform) ## Apply the function to each element in the iterator
tempList = list(dataPipe)
From 8b105b6f905361ff7024ed1462e060e3571311f1 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Fri, 5 May 2023 03:30:18 +0530
Subject: [PATCH 26/31] fixed issue in title
---
beginner_source/torchtext_custom_dataset_tutorial.py | 2 +-
index.rst | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index ad749d9fefb..e4ec3b48100 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
"""
Pre-process custom text dataset using Torchtext
-==============================================
+===============================================
**Author**: `Anupam Sharma `_
diff --git a/index.rst b/index.rst
index d23ec3a701a..6e6d687d0c1 100644
--- a/index.rst
+++ b/index.rst
@@ -265,7 +265,7 @@ What's new in PyTorch tutorials?
:tags: Text
.. customcarditem::
- :header: Preaparing custom text dataset using Torchtext
+ :header: Pre-process custom text dataset using Torchtext
:card_description: Learn how to use torchtext to prepare a custom dataset
:image: _static/img/thumbnails/cropped/torch_text_logo.png
:link: beginner/torchtext_custom_dataset_tutorial.html
From 409b29d046dc904d2e5f1905e8f071276e8250c2 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Fri, 5 May 2023 03:31:38 +0530
Subject: [PATCH 27/31] use another dataset
---
Makefile | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/Makefile b/Makefile
index fe5cd32e31e..ed0ade00465 100644
--- a/Makefile
+++ b/Makefile
@@ -107,8 +107,8 @@ download:
tar $(TAROPTS) -xzf $(DATADIR)/vocab.tar.gz -C ./beginner_source/data/
# Download dataset for beginner_source/torchtext_custom_dataset_tutorial.py
- wget -nv -N https://www.manythings.org/anki/fin-eng.zip -P $(DATADIR)
- unzip -o $(DATADIR)/fin-eng.zip -d beginner_source/data/
+ wget -nv -N https://www.manythings.org/anki/deu-eng.zip -P $(DATADIR)
+ unzip -o $(DATADIR)/deu-eng.zip -d beginner_source/data/
docs:
From 8925bcba4e9c9282ecf7eae320e519f8a4b8f32f Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Tue, 16 May 2023 18:43:05 +0530
Subject: [PATCH 28/31] addressed review comments for PR #2307
---
.../torchtext_custom_dataset_tutorial.py | 164 +++++++++---------
1 file changed, 81 insertions(+), 83 deletions(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index e4ec3b48100..1439615050b 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -21,12 +21,11 @@
Let us assume that we need to prepare a dataset to train a model that can perform English to
German translation. We will use a tab-delimited German - English sentence pairs provided by
-the `Tatoeba Project `_ which can be downloaded from this link: `Click
-Here `__.
+the `Tatoeba Project `_ which can be downloaded from
+`this link `__.
-Sentence pairs for other languages can be found in this link:
-
-Link: `https://www.manythings.org/anki/ `__
+Sentence pairs for other languages can be found in `this link `\
+__.
"""
# %%
@@ -37,11 +36,11 @@
#
# Ensure that following packages are installed:
#
-# * `Torchdata 0.6.0 `_ (Installation instructions: `C\
-# lick here `__)
-# * `Torchtext 0.15.0 `_ (Installation instructions:\
-# `Click here `__)
-# * Spacy (Docs: `Click here `__)
+# * `Torchdata 0.6.0 `_ (`Installation instructions \
+# `__)
+# * `Torchtext 0.15.0 `_ (`Installation instructions \
+# `__)
+# * `Spacy `__
#
# Here, we are using `Spacy` to tokenize text. In simple words tokenization means to
# convert a sentence to list of words. Spacy is a python package used for various Natural
@@ -67,9 +66,9 @@
# Now we will load the dataset
FILE_PATH = 'data/deu.txt'
-dataPipe = dp.iter.IterableWrapper([FILE_PATH])
-dataPipe = dp.iter.FileOpener(dataPipe, mode='rb')
-dataPipe = dataPipe.parse_csv(skip_lines=0, delimiter='\t', as_tuple=True)
+data_pipe = dp.iter.IterableWrapper([FILE_PATH])
+data_pipe = dp.iter.FileOpener(data_pipe, mode='rb')
+data_pipe = data_pipe.parse_csv(skip_lines=0, delimiter='\t', as_tuple=True)
# %%
# In the above code block, we are doing following things:
@@ -81,15 +80,15 @@
# again returns an iterable of tuples representing each rows
# of the tab-delimited file
#
-# Data pipes can be thought of something like a dataset object, on which
+# DataPipes can be thought of as something like a dataset object, on which
# we can perform various operations.
# Check `this tutorial `_ for more details on
-# data pipes.
+# DataPipes.
#
# We can verify if the iterable has the pair of sentences as shown
# below:
-for sample in dataPipe:
+for sample in data_pipe:
print(sample)
break
@@ -97,35 +96,35 @@
# Note that we also have attribution details along with pair of sentences. We will
# write a small function to remove the attribution details:
-def remove_attribution(row):
+def removeAttribution(row):
"""
Function to keep the first two elements in a tuple
"""
return row[:2]
-dataPipe = dataPipe.map(remove_attribution)
+data_pipe = data_pipe.map(removeAttribution)
# %%
-# The `map` function at line 2 in above code block can be used to apply some function
-# on each elements of data pipe. Now, we can verify that the data pipe only contains
+# The `map` function at line 6 in above code block can be used to apply some function
+# on each element of `data_pipe`. Now, we can verify that the `data_pipe` only contains
# pair of sentences.
-for sample in dataPipe:
+for sample in data_pipe:
print(sample)
break
# %%
# Now, let us define few functions to perform tokenization:
-def eng_tokenize(text):
+def engTokenize(text):
"""
- Tokenize an English text and returns list of tokens
+ Tokenize an English text and return a list of tokens
"""
return [token.text for token in eng.tokenizer(text)]
-def de_tokenize(text):
+def deTokenize(text):
"""
- Tokenize a German text and returns list of tokens
+ Tokenize a German text and return a list of tokens
"""
return [token.text for token in de.tokenizer(text)]
@@ -133,8 +132,8 @@ def de_tokenize(text):
# Above function accepts a text and returns a list of words
# as shown below:
-print(eng_tokenize("Have a good day!!!"))
-print(de_tokenize("Haben Sie einen guten Tag!!!"))
+print(engTokenize("Have a good day!!!"))
+print(deTokenize("Haben Sie einen guten Tag!!!"))
# %%
# Building the vocabulary
@@ -145,9 +144,9 @@ def de_tokenize(text):
# We will build vocabulary for both our source and target now.
#
# Let us define a function to get tokens from elements of tuples in the iterator.
-# The comments within the function specifies the need and working of it:
-def get_tokens(data_iter, place):
+
+def getTokens(data_iter, place):
"""
Function to yield tokens from an iterator. Since, our iterator contains
tuple of sentences (source and target), `place` parameters defines for which
@@ -155,25 +154,25 @@ def get_tokens(data_iter, place):
"""
for english, german in data_iter:
if place == 0:
- yield eng_tokenize(english)
+ yield engTokenize(english)
else:
- yield de_tokenize(german)
+ yield deTokenize(german)
# %%
# Now, we will build vocabulary for source:
-sourceVocab = build_vocab_from_iterator(
- get_tokens(dataPipe,0),
+source_vocab = build_vocab_from_iterator(
+ getTokens(data_pipe,0),
min_freq=2,
    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
special_first=True
)
-sourceVocab.set_default_index(sourceVocab['<unk>'])
+source_vocab.set_default_index(source_vocab['<unk>'])
# %%
# The code above, builds the vocabulary from the iterator. In the above code block:
#
-# * At line 2, we call the `get_tokens()` function with `place=0` as we need vocabulary for
+# * At line 2, we call the `getTokens()` function with `place=0` as we need vocabulary for
# source sentences.
# * At line 3, we set `min_freq=2`. This means, the function will skip those words that occurs
# less than 2 times.
@@ -194,23 +193,23 @@ def get_tokens(data_iter, place):
#
# Similarly, we will build vocabulary for target sentences:
-targetVocab = build_vocab_from_iterator(
- get_tokens(dataPipe,1),
+target_vocab = build_vocab_from_iterator(
+ getTokens(data_pipe,1),
min_freq=2,
    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
special_first=True
)
-targetVocab.set_default_index(targetVocab['<unk>'])
+target_vocab.set_default_index(target_vocab['<unk>'])
# %%
# Note that the example above shows how can we add special tokens to our vocabulary. The
# special tokens may change based on the requirements.
#
# Now, we can verify that special tokens are placed at the beginning and then other words.
-# In the below code, `sourceVocab.get_itos()` returns a list with tokens at index based on
+# In the below code, `source_vocab.get_itos()` returns a list with tokens at index based on
# vocabulary.
-print(sourceVocab.get_itos()[:9])
+print(source_vocab.get_itos()[:9])
# %%
# Numericalize sentences using vocabulary
@@ -218,7 +217,7 @@ def get_tokens(data_iter, place):
# After building the vocabulary, we need to convert our sentences to corresponding indices.
# Let us define some functions for this:
-def get_transform(vocab):
+def getTransform(vocab):
"""
Create transforms based on given vocabulary. The returned transform is applied to sequence
of tokens.
@@ -237,55 +236,55 @@ def get_transform(vocab):
# %%
# Now, let us see how to use the above function. The function returns an object of `Transforms`
-# which we will use on our sentence. Let us take a random sentence and check the working of
-# the transform:
+# which we will use on our sentence. Let us take a random sentence and check how the transform
+# works.
-tempList = list(dataPipe)
-someSentence = tempList[798][0]
+temp_list = list(data_pipe)
+some_sentence = temp_list[798][0]
print("Some sentence=", end="")
-print(someSentence)
-transformedSentence = get_transform(sourceVocab)(eng_tokenize(someSentence))
+print(some_sentence)
+transformed_sentence = getTransform(source_vocab)(engTokenize(some_sentence))
print("Transformed sentence=", end="")
-print(transformedSentence)
-indexToString = sourceVocab.get_itos()
-for index in transformedSentence:
- print(indexToString[index], end=" ")
+print(transformed_sentence)
+index_to_string = source_vocab.get_itos()
+for index in transformed_sentence:
+ print(index_to_string[index], end=" ")
# %%
# In the above code:
#
-# * At line 2, we take a source sentence from list that we created from dataPipe at line 1
-# * At line 5, we get a transform based on a source vocabulary and apply it to a tokenized
-# sentence. Note that transforms take list of words and not a sentence.
-# * At line 8, we get the mapping of index to string and then use it get the transformed
-# sentence
+# * At line 2, we take a source sentence from the list that we created from `data_pipe` at line 1
+# * At line 5, we get a transform based on a source vocabulary and apply it to a tokenized
+#   sentence. Note that transforms take a list of words and not a sentence.
+# * At line 8, we get the mapping of index to string and then use it to get the transformed
+#   sentence
#
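# %%
# The body of `getTransform` is not visible in this diff. For orientation only, here is a sketch
# of one common way to compose such a transform with `torchtext.transforms` (imported above as
# `T`): convert tokens to indices, then add the `<sos>` and `<eos>` tokens around the sequence.
# The helper name and the exact structure are assumptions, not the file's actual code:

def getTransformSketch(vocab):
    """Illustrative only: token list -> indices, wrapped with <sos> and <eos>."""
    return T.Sequential(
        T.VocabTransform(vocab),                 # map tokens to vocabulary indices
        T.AddToken(vocab['<sos>'], begin=True),  # prepend start-of-sentence index
        T.AddToken(vocab['<eos>'], begin=False), # append end-of-sentence index
    )

# If the assumptions above hold, this should mirror the transformed sentence printed earlier:
print(getTransformSketch(source_vocab)(engTokenize(some_sentence)))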
-# Now we will use functions of `dataPipe` to apply transform to all our sentences.
+# Now we will use DataPipe functions to apply the transform to all our sentences.
# Let us define some more functions for this.
-def apply_transform(sequence_pair):
+def applyTransform(sequence_pair):
"""
    Apply transforms to the sequences of tokens in a sequence pair
"""
return (
- get_transform(sourceVocab)(eng_tokenize(sequence_pair[0])),
- get_transform(targetVocab)(de_tokenize(sequence_pair[1]))
+ getTransform(source_vocab)(engTokenize(sequence_pair[0])),
+ getTransform(target_vocab)(deTokenize(sequence_pair[1]))
)
-dataPipe = dataPipe.map(apply_transform) ## Apply the function to each element in the iterator
-tempList = list(dataPipe)
-print(tempList[0])
+data_pipe = data_pipe.map(applyTransform) ## Apply the function to each element in the iterator
+temp_list = list(data_pipe)
+print(temp_list[0])
# %%
# Make batches (with bucket batch)
# --------------------------------
# Generally, we train models in batches. While working with sequence-to-sequence models, it is
# recommended to keep the length of sequences in a batch similar. For that we will use
-# `bucketbatch` function of `dataPipe`.
+# `bucketbatch` function of `data_pipe`.
#
# Let us define some functions that will be used by the `bucketbatch` function.
-def sort_bucket(bucket):
+def sortBucket(bucket):
"""
Function to sort a given bucket. Here, we want to sort based on the length of
    source and target sequences.
@@ -295,9 +294,9 @@ def sort_bucket(bucket):
# %%
# Now, we will apply the `bucketbatch` function:
-dataPipe = dataPipe.bucketbatch(
+data_pipe = data_pipe.bucketbatch(
batch_size = 4, batch_num=5, bucket_num=1,
- use_in_batch_shuffle=False, sort_key=sort_bucket
+ use_in_batch_shuffle=False, sort_key=sortBucket
)
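# %%
# The body of `sortBucket` is cut off in the hunk above. A minimal implementation consistent with
# its docstring would sort each bucket by the lengths of the source and target sequences. The
# helper below is only an illustrative sketch under that assumption (note the different name);
# it is not necessarily the exact code in the file:

def sortBucketSketch(bucket):
    """Sort (source, target) pairs by source length, then by target length."""
    return sorted(bucket, key=lambda pair: (len(pair[0]), len(pair[1])))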
# %%
@@ -310,14 +309,14 @@ def sort_bucket(bucket):
#
# Now, let us consider a batch of source sentences as `X` and a batch of target sentences as `y`.
# Generally, while training a model, we predict on a batch of `X` and compare the result with `y`.
-# But, a batch in our `dataPipe` is of the form `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]`:
+# But, a batch in our `data_pipe` is of the form `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]`:
-print(list(dataPipe)[0])
+print(list(data_pipe)[0])
# %%
# So, we will now convert them into the form: `((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))`.
# For this we will write a small function:
-def separate_source_target(sequence_pairs):
+def separateSourceTarget(sequence_pairs):
"""
input of form: `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]`
output of form: `((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))`
@@ -326,8 +325,8 @@ def separate_source_target(sequence_pairs):
return sources,targets
## Apply the function to each element in the iterator
-dataPipe = dataPipe.map(separate_source_target)
-print(list(dataPipe)[0])
+data_pipe = data_pipe.map(separateSourceTarget)
+print(list(data_pipe)[0])
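# %%
# The middle of `separateSourceTarget` is elided by the diff. Reshaping a list of pairs into a
# pair of tuples is typically a one-line `zip`; the helper below is a sketch of that idea and an
# assumption about the elided body, not necessarily the exact code in the file:

def separateSourceTargetSketch(sequence_pairs):
    """Turn [(X_1,y_1), ..., (X_n,y_n)] into ((X_1,...,X_n), (y_1,...,y_n))."""
    sources, targets = zip(*sequence_pairs)
    return sources, targets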
# %%
# Now, we have the data as desired.
@@ -337,7 +336,7 @@ def separate_source_target(sequence_pairs):
# As discussed earlier while building vocabulary, we need to pad shorter sentences in a batch to
# make all the sequences in a batch the same length. We can perform padding as follows:
-def apply_padding(pair_of_sequences):
+def applyPadding(pair_of_sequences):
"""
Convert sequnces to tensors and apply padding
"""
@@ -345,16 +344,16 @@ def apply_padding(pair_of_sequences):
## `T.ToTensor(0)` returns a transform that converts the sequence to `torch.tensor` and also applies
# padding. Here, `0` is passed to the constructor to specify the index of the `<pad>` token in the
# vocabulary.
-dataPipe = dataPipe.map(apply_padding)
+data_pipe = data_pipe.map(applyPadding)
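# %%
# To see what `T.ToTensor(0)` does in isolation, here is a small self-contained check; the index
# sequences below are made up purely for illustration. The shorter sequence should be padded with
# the value `0` so that both rows of the resulting tensor have the same length:

print(T.ToTensor(0)([[9, 10, 11], [12, 13]]))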
# %%
# Now, we can use the index to string mapping to see how the sequence would look with tokens
# instead of indices:
-sourceItoS = sourceVocab.get_itos()
-targetItoS = targetVocab.get_itos()
+source_index_to_string = source_vocab.get_itos()
+target_index_to_string = target_vocab.get_itos()
-def show_some_transformed_sentences(data_pipe):
+def showSomeTransformedSentences(data_pipe):
"""
Function to show how the senetnces look like after applying all transforms.
Here we try to print actual words instead of corresponding index
@@ -365,19 +364,18 @@ def show_some_transformed_sentences(data_pipe):
for i in range(4):
source = ""
for token in sources[i]:
- source += " " + sourceItoS[token]
+ source += " " + source_index_to_string[token]
target = ""
for token in targets[i]:
- target += " " + targetItoS[token]
+ target += " " + target_index_to_string[token]
print(f"Source: {source}")
print(f"Traget: {target}")
break
-show_some_transformed_sentences(dataPipe)
+showSomeTransformedSentences(data_pipe)
# %%
+# In the above output we can observe that the shorter sentences are padded with `<pad>`. Now, we
-# can use this dataPipe while writing our training function.
+# can use `data_pipe` while writing our training function.
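# %%
# As a rough sketch of how `data_pipe` could be consumed later, a training loop would simply
# iterate over the batches of padded tensors. Below we only print the shapes of the first batch;
# the actual model, loss, and optimizer are placeholders left out of this tutorial:

for sources, targets in data_pipe:
    print("Source batch shape:", sources.shape)
    print("Target batch shape:", targets.shape)
    # A real training step would run the model on `sources`, compare the output with `targets`,
    # compute a loss, and update the parameters here.
    break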
#
-# Some parts of this tutorial was inspired from this article:
-# Link: `https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71\
+# Some parts of this tutorial were inspired by `this article
# <https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71>`__.
From 9f71e3fa81569c001c2284600e8295c2cfdd9848 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Tue, 16 May 2023 22:45:13 +0530
Subject: [PATCH 29/31] corrected spelling mistakes
---
.../torchtext_custom_dataset_tutorial.py | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 1439615050b..05d3c0b8625 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -1,12 +1,12 @@
# -*- coding: utf-8 -*-
"""
-Pre-process custom text dataset using Torchtext
+Preprocess custom text dataset using Torchtext
===============================================
**Author**: `Anupam Sharma `_
This tutorial illustrates the usage of torchtext on a dataset that is not built-in. In the tutorial,
-we will pre-process a dataset that can be further utilized to train a sequence-to-sequence
+we will preprocess a dataset that can be further utilized to train a sequence-to-sequence
model for machine translation (something like, in this tutorial: `Sequence to Sequence Learning
with Neural Networks `_) but without using a legacy version
@@ -46,7 +46,7 @@
# convert a sentence to a list of words. Spacy is a Python package used for various Natural
# Language Processing (NLP) tasks.
#
-# Download the English and German models from spacy as shown below: ::
+# Download the English and German models from Spacy as shown below: ::
#
# python -m spacy download en_core_web_sm
# python -m spacy download de_core_news_sm
@@ -179,7 +179,7 @@ def getTokens(data_iter, place):
# * At line 4, we specify some special tokens:
#
# * `<sos>` for start of sentence
-# * `<eos>` for end of senetence
+# * `<eos>` for end of sentence
# * `<unk>` for unknown words. An example of an unknown word is the one skipped because of
# `min_freq=2`.
# * `<pad>` is the padding token. While training a model, we mostly train in batches. In a
@@ -189,7 +189,7 @@ def getTokens(data_iter, place):
# * At line 5, we set `special_first=True`. This means `<pad>` will get index 0, `<sos>` index 1,
#   `<eos>` index 2, and `<unk>` will get index 3 in the vocabulary.
# * At line 7, we set the default index as the index of `<unk>`. That means if some word is not in
-#   vocbulary, we will use `<unk>` instead of that unknown word.
+#   vocabulary, we will use `<unk>` instead of that unknown word.
#
# Similarly, we will build vocabulary for target sentences:
@@ -338,7 +338,7 @@ def separateSourceTarget(sequence_pairs):
def applyPadding(pair_of_sequences):
"""
- Convert sequnces to tensors and apply padding
+ Convert sequences to tensors and apply padding
"""
return (T.ToTensor(0)(list(pair_of_sequences[0])), T.ToTensor(0)(list(pair_of_sequences[1])))
## `T.ToTensor(0)` returns a transform that converts the sequence to `torch.tensor` and also applies
@@ -355,7 +355,7 @@ def applyPadding(pair_of_sequences):
def showSomeTransformedSentences(data_pipe):
"""
- Function to show how the senetnces look like after applying all transforms.
+    Function to show what the sentences look like after applying all transforms.
Here we try to print actual words instead of corresponding index
"""
for sources,targets in data_pipe:
From 12b644072d9f720123ae27639f8eb9adc3cbbee5 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Wed, 17 May 2023 00:57:40 +0530
Subject: [PATCH 30/31] followed pyspelling's configuration for the shell
commands
---
.../torchtext_custom_dataset_tutorial.py | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py
index 05d3c0b8625..9875d8aa43a 100644
--- a/beginner_source/torchtext_custom_dataset_tutorial.py
+++ b/beginner_source/torchtext_custom_dataset_tutorial.py
@@ -46,10 +46,13 @@
# convert a sentence to a list of words. Spacy is a Python package used for various Natural
# Language Processing (NLP) tasks.
#
-# Download the English and German models from Spacy as shown below: ::
+# Download the English and German models from Spacy as shown below:
+#
+# .. code-block:: shell
+#
+# python -m spacy download en_core_web_sm
+# python -m spacy download de_core_news_sm
#
-# python -m spacy download en_core_web_sm
-# python -m spacy download de_core_news_sm
# %%
@@ -59,8 +62,8 @@
import torchtext.transforms as T
import spacy
from torchtext.vocab import build_vocab_from_iterator
-eng = spacy.load("en_core_web_sm") # Load the English model to be used for tokenizing
-de = spacy.load("de_core_news_sm") # Load the German model to be used for tokenizing
+eng = spacy.load("en_core_web_sm") # Load the English model to tokenize English text
+de = spacy.load("de_core_news_sm") # Load the German model to tokenize German text
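# %%
# As a quick, optional check (not part of this patch), the loaded pipelines can be used to
# tokenize a sentence; the sample sentences below are made up for illustration:

print([token.text for token in eng.tokenizer("Have a good day!")])
print([token.text for token in de.tokenizer("Einen schönen Tag noch!")])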
# %%
# Now we will load the dataset
From f48bc259ccc54adc7413ec985066606a62cb66e5 Mon Sep 17 00:00:00 2001
From: anp-scp <29808870+anp-scp@users.noreply.github.com>
Date: Wed, 17 May 2023 00:59:24 +0530
Subject: [PATCH 31/31] added words used in
beginner_source/torchtext_custom_dataset_tutorial.py
---
en-wordlist.txt | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/en-wordlist.txt b/en-wordlist.txt
index fdf5df67d8d..20b63dadc2d 100644
--- a/en-wordlist.txt
+++ b/en-wordlist.txt
@@ -30,6 +30,8 @@ DDP
DDQN
DNN
DQN
+DataPipe
+DataPipes
DataLoaders
DeepMind
DeiT
@@ -126,6 +128,7 @@ SciPy
Sequentials
Sigmoid
SoTA
+Spacy
TPU
TensorBoard
TextVQA
@@ -345,6 +348,7 @@ timestep
timesteps
tokenization
tokenize
+tokenized
tokenizer
tokenizes
tooltip