From 51ff05a9c2449313cb6c950cfadc83bd1748706a Mon Sep 17 00:00:00 2001
From: noqqaqq
Date: Mon, 12 Jun 2023 14:35:26 +0000
Subject: [PATCH 1/4] Mention prerequisites for running the tutorial, based on
 observations from issue 1993

---
 .../text_sentiment_ngrams_tutorial.py         | 118 +++++++++++-------
 1 file changed, 70 insertions(+), 48 deletions(-)

diff --git a/beginner_source/text_sentiment_ngrams_tutorial.py b/beginner_source/text_sentiment_ngrams_tutorial.py
index 9036cdd7214..4ae53cbf890 100644
--- a/beginner_source/text_sentiment_ngrams_tutorial.py
+++ b/beginner_source/text_sentiment_ngrams_tutorial.py
@@ -7,6 +7,14 @@
 - Access to the raw data as an iterator
 - Build data processing pipeline to convert the raw text strings into ``torch.Tensor`` that can be used to train the model
 - Shuffle and iterate the data with `torch.utils.data.DataLoader <https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader>`__
+
+
+Prerequisites:
+   - | Recent 2.x version of portalocker package needs to be installed prior to running the tutorial.
+     | E.g., in Colab environment this can be done by adding following line at the top of the script:
+     | `!pip install -U portalocker>=2.0.0`
+     | (More details https://github.com/pytorch/tutorials/issues/1993)
+
 """
 
 
@@ -16,12 +24,13 @@
 #
 # The torchtext library provides a few raw dataset iterators, which yield the raw text strings. For example, the ``AG_NEWS`` dataset iterators yield the raw data as a tuple of label and text.
 #
-# To access torchtext datasets, please install torchdata following instructions at https://github.com/pytorch/data. 
+# To access torchtext datasets, please install torchdata following instructions at https://github.com/pytorch/data.
 #
 
 import torch
 from torchtext.datasets import AG_NEWS
-train_iter = iter(AG_NEWS(split='train'))
+
+train_iter = iter(AG_NEWS(split="train"))
 
 ######################################################################
 # ::
@@ -60,13 +69,15 @@
 from torchtext.data.utils import get_tokenizer
 from torchtext.vocab import build_vocab_from_iterator
 
-tokenizer = get_tokenizer('basic_english')
-train_iter = AG_NEWS(split='train')
+tokenizer = get_tokenizer("basic_english")
+train_iter = AG_NEWS(split="train")
+
 
 def yield_tokens(data_iter):
     for _, text in data_iter:
         yield tokenizer(text)
 
+
 vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
 vocab.set_default_index(vocab["<unk>"])
 
@@ -96,7 +107,6 @@ def yield_tokens(data_iter):
 #
 
 
-
 ######################################################################
 # Generate data batch and iterator
 # --------------------------------
@@ -111,22 +121,27 @@ def yield_tokens(data_iter):
 
 from torch.utils.data import DataLoader
 
+
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+
 def collate_batch(batch):
     label_list, text_list, offsets = [], [], [0]
-    for (_label, _text) in batch:
-         label_list.append(label_pipeline(_label))
-         processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
-         text_list.append(processed_text)
-         offsets.append(processed_text.size(0))
+    for _label, _text in batch:
+        label_list.append(label_pipeline(_label))
+        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
+        text_list.append(processed_text)
+        offsets.append(processed_text.size(0))
     label_list = torch.tensor(label_list, dtype=torch.int64)
     offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
     text_list = torch.cat(text_list)
     return label_list.to(device), text_list.to(device), offsets.to(device)
 
-train_iter = AG_NEWS(split='train')
-dataloader = DataLoader(train_iter, batch_size=8, shuffle=False,
-                        collate_fn=collate_batch)
+
+train_iter = AG_NEWS(split="train")
+dataloader = DataLoader(
+    train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch
+)
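The offsets bookkeeping in ``collate_batch`` above is the part that most often trips readers up: ``nn.EmbeddingBag`` consumes one flat 1-D tensor of token ids for the whole batch plus the starting index of each entry, so no padding is ever materialized. A minimal sketch of the same computation on toy ids (hypothetical values, independent of the AG_NEWS pipeline):

    import torch

    # Two "tokenized" texts of lengths 3 and 2, already mapped to vocab ids.
    seqs = [torch.tensor([4, 21, 7]), torch.tensor([9, 3])]

    # Same trick as collate_batch: each text starts where the previous one ended.
    offsets = torch.tensor([0] + [len(s) for s in seqs[:-1]]).cumsum(dim=0)  # tensor([0, 3])
    flat = torch.cat(seqs)  # tensor([ 4, 21,  7,  9,  3])

    # By default EmbeddingBag averages the embeddings in each [offset, next offset) slice.
    bag = torch.nn.EmbeddingBag(num_embeddings=32, embedding_dim=8)
    print(bag(flat, offsets).shape)  # torch.Size([2, 8]) -- one vector per text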
 
 
 ######################################################################
@@ -144,8 +159,8 @@ def collate_batch(batch):
 
 from torch import nn
 
-class TextClassificationModel(nn.Module):
 
+class TextClassificationModel(nn.Module):
     def __init__(self, vocab_size, embed_dim, num_class):
         super(TextClassificationModel, self).__init__()
         self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
@@ -179,7 +194,7 @@ def forward(self, text, offsets):
 # We build a model with the embedding dimension of 64. The vocab size is equal to the length of the vocabulary instance. The number of classes is equal to the number of labels,
 #
 
-train_iter = AG_NEWS(split='train')
+train_iter = AG_NEWS(split="train")
 num_class = len(set([label for (label, text) in train_iter]))
 vocab_size = len(vocab)
 emsize = 64
@@ -194,6 +209,7 @@ def forward(self, text, offsets):
 
 import time
 
+
 def train(dataloader):
     model.train()
     total_acc, total_count = 0, 0
@@ -211,12 +227,16 @@ def train(dataloader):
         total_count += label.size(0)
         if idx % log_interval == 0 and idx > 0:
             elapsed = time.time() - start_time
-            print('| epoch {:3d} | {:5d}/{:5d} batches '
-                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
-                                              total_acc/total_count))
+            print(
+                "| epoch {:3d} | {:5d}/{:5d} batches "
+                "| accuracy {:8.3f}".format(
+                    epoch, idx, len(dataloader), total_acc / total_count
+                )
+            )
             total_acc, total_count = 0, 0
             start_time = time.time()
 
+
 def evaluate(dataloader):
     model.eval()
     total_acc, total_count = 0, 0
@@ -227,7 +247,7 @@ def evaluate(dataloader):
             predicted_label = model(text, offsets)
             loss = criterion(predicted_label, label)
             total_acc += (predicted_label.argmax(1) == label).sum().item()
             total_count += label.size(0)
-    return total_acc/total_count
+    return total_acc / total_count
@@ -253,10 +273,11 @@ def evaluate(dataloader):
 from torch.utils.data.dataset import random_split
 from torchtext.data.functional import to_map_style_dataset
 
+
 # Hyperparameters
-EPOCHS = 10 # epoch
+EPOCHS = 10  # epoch
 LR = 5  # learning rate
-BATCH_SIZE = 64 # batch size for training
+BATCH_SIZE = 64  # batch size for training
 
 criterion = torch.nn.CrossEntropyLoss()
 optimizer = torch.optim.SGD(model.parameters(), lr=LR)
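In the epoch loop that follows, ``scheduler.step()`` fires only when validation accuracy regresses, so the learning rate stays at its initial value while the model is still improving. A toy trace of that gating with made-up accuracies, assuming the tutorial's ``StepLR(optimizer, 1.0, gamma=0.1)`` schedule (the scheduler definition itself sits outside the hunks shown here):

    import torch

    param = torch.nn.Parameter(torch.zeros(1))
    optimizer = torch.optim.SGD([param], lr=5.0)  # LR = 5, as above
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

    total_accu = None
    for accu_val in [0.80, 0.86, 0.85, 0.90]:  # made-up validation accuracies
        optimizer.step()  # stand-in for one epoch of train(train_dataloader)
        if total_accu is not None and total_accu > accu_val:
            scheduler.step()  # accuracy regressed: decay the learning rate
        else:
            total_accu = accu_val  # accuracy improved: remember the new best
        print(optimizer.param_groups[0]["lr"])  # 5.0, 5.0, 0.5, 0.5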
@@ -266,31 +287,36 @@ def evaluate(dataloader):
 train_dataset = to_map_style_dataset(train_iter)
 test_dataset = to_map_style_dataset(test_iter)
 num_train = int(len(train_dataset) * 0.95)
-split_train_, split_valid_ = \
-    random_split(train_dataset, [num_train, len(train_dataset) - num_train])
-
-train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
-                              shuffle=True, collate_fn=collate_batch)
-valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
-                              shuffle=True, collate_fn=collate_batch)
-test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
-                             shuffle=True, collate_fn=collate_batch)
+split_train_, split_valid_ = random_split(
+    train_dataset, [num_train, len(train_dataset) - num_train]
+)
+
+train_dataloader = DataLoader(
+    split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
+)
+valid_dataloader = DataLoader(
+    split_valid_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
+)
+test_dataloader = DataLoader(
+    test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
+)
 
 for epoch in range(1, EPOCHS + 1):
     epoch_start_time = time.time()
     train(train_dataloader)
     accu_val = evaluate(valid_dataloader)
     if total_accu is not None and total_accu > accu_val:
-      scheduler.step()
+        scheduler.step()
     else:
-       total_accu = accu_val
-    print('-' * 59)
-    print('| end of epoch {:3d} | time: {:5.2f}s | '
-          'valid accuracy {:8.3f} '.format(epoch,
-                                           time.time() - epoch_start_time,
-                                           accu_val))
-    print('-' * 59)
-
+        total_accu = accu_val
+    print("-" * 59)
+    print(
+        "| end of epoch {:3d} | time: {:5.2f}s | "
+        "valid accuracy {:8.3f} ".format(
+            epoch, time.time() - epoch_start_time, accu_val
+        )
+    )
+    print("-" * 59)
 
@@ -299,15 +325,12 @@ def evaluate(dataloader):
 # Evaluate the model with test dataset
 # ------------------------------------
 #
 
-
 ######################################################################
 # Checking the results of the test dataset…
 
-print('Checking the results of test dataset.')
+print("Checking the results of test dataset.")
 accu_test = evaluate(test_dataloader)
-print('test accuracy {:8.3f}'.format(accu_test))
-
-
+print("test accuracy {:8.3f}".format(accu_test))
 
@@ -318,10 +341,8 @@ def evaluate(dataloader):
 #
 
-ag_news_label = {1: "World",
-                 2: "Sports",
-                 3: "Business",
-                 4: "Sci/Tec"}
+ag_news_label = {1: "World", 2: "Sports", 3: "Business", 4: "Sci/Tec"}
+
 
 def predict(text, text_pipeline):
     with torch.no_grad():
         text = torch.tensor(text_pipeline(text))
@@ -329,6 +350,7 @@ def predict(text, text_pipeline):
         output = model(text, torch.tensor([0]))
         return output.argmax(1).item() + 1
 
+
 ex_text_str = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \
     enduring the season’s worst weather conditions on Sunday at The \
     Open on his way to a closing 75 at Royal Portrush, which \
@@ -343,4 +365,4 @@ def predict(text, text_pipeline):
 
 model = model.to("cpu")
 
-print("This is a %s news" %ag_news_label[predict(ex_text_str, text_pipeline)])
+print("This is a %s news" % ag_news_label[predict(ex_text_str, text_pipeline)])
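The three follow-up patches below only rework the wording and reStructuredText markup of the new prerequisite note. A complementary, purely hypothetical guard (not part of any of these patches) that enforces the same requirement at runtime instead of documenting it:

    # Fail fast if the portalocker prerequisite from issue 1993 is not met,
    # rather than hitting a less obvious error later when torchtext caches
    # the dataset.
    from importlib.metadata import PackageNotFoundError, version

    try:
        installed = version("portalocker")
    except PackageNotFoundError:
        raise SystemExit("Missing prerequisite: pip install -U 'portalocker>=2.0.0'")
    if int(installed.split(".")[0]) < 2:
        raise SystemExit(f"portalocker>=2.0.0 required, found {installed}")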
From aa95851ead9cc0c3606d5cf748d8fe88c83e805d Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu
Date: Mon, 12 Jun 2023 11:47:53 -0700
Subject: [PATCH 2/4] Apply suggestions from code review

---
 beginner_source/text_sentiment_ngrams_tutorial.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/beginner_source/text_sentiment_ngrams_tutorial.py b/beginner_source/text_sentiment_ngrams_tutorial.py
index 4ae53cbf890..4eff360b5b1 100644
--- a/beginner_source/text_sentiment_ngrams_tutorial.py
+++ b/beginner_source/text_sentiment_ngrams_tutorial.py
@@ -9,11 +9,13 @@
 - Shuffle and iterate the data with `torch.utils.data.DataLoader <https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader>`__
 
 
-Prerequisites:
-   - | Recent 2.x version of portalocker package needs to be installed prior to running the tutorial.
-     | E.g., in Colab environment this can be done by adding following line at the top of the script:
-     | `!pip install -U portalocker>=2.0.0`
-     | (More details https://github.com/pytorch/tutorials/issues/1993)
+Prerequisites
+~~~~~~~~~~~~~~~~
+  - A recent 2.x version of the ``portalocker`` package needs to be installed prior to running the tutorial.
+    For example, in the Colab environment, this can be done by adding the following line at the top of the script:
+    .. code-block:: bash
+
+       !pip install -U portalocker>=2.0.0
 
 """

From fc0a02331452ad2bea1b7a444f9e48e326eeb7a2 Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu
Date: Mon, 12 Jun 2023 11:49:32 -0700
Subject: [PATCH 3/4] Apply suggestions from code review

---
 beginner_source/text_sentiment_ngrams_tutorial.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/beginner_source/text_sentiment_ngrams_tutorial.py b/beginner_source/text_sentiment_ngrams_tutorial.py
index 4eff360b5b1..ffac8ca1b86 100644
--- a/beginner_source/text_sentiment_ngrams_tutorial.py
+++ b/beginner_source/text_sentiment_ngrams_tutorial.py
@@ -11,11 +11,13 @@
 
 Prerequisites
 ~~~~~~~~~~~~~~~~
-  - A recent 2.x version of the ``portalocker`` package needs to be installed prior to running the tutorial.
-    For example, in the Colab environment, this can be done by adding the following line at the top of the script:
-    .. code-block:: bash
+
+- A recent 2.x version of the ``portalocker`` package needs to be installed prior to running the tutorial.
+For example, in the Colab environment, this can be done by adding the following line at the top of the script:
+
+.. code-block:: bash
 
-       !pip install -U portalocker>=2.0.0
+   !pip install -U portalocker>=2.0.0
 
 """

From 8603eeec1f300ef8bfb9eb1b4c1ba588129f64dc Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu
Date: Mon, 12 Jun 2023 12:41:25 -0700
Subject: [PATCH 4/4] Update beginner_source/text_sentiment_ngrams_tutorial.py

---
 beginner_source/text_sentiment_ngrams_tutorial.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/beginner_source/text_sentiment_ngrams_tutorial.py b/beginner_source/text_sentiment_ngrams_tutorial.py
index ffac8ca1b86..021befdb972 100644
--- a/beginner_source/text_sentiment_ngrams_tutorial.py
+++ b/beginner_source/text_sentiment_ngrams_tutorial.py
@@ -12,7 +12,7 @@
 Prerequisites
 ~~~~~~~~~~~~~~~~
 
-- A recent 2.x version of the ``portalocker`` package needs to be installed prior to running the tutorial.
+A recent 2.x version of the ``portalocker`` package needs to be installed prior to running the tutorial.
 For example, in the Colab environment, this can be done by adding the following line at the top of the script:
 
 .. code-block:: bash
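With all four patches applied, the prerequisite note renders cleanly and the tutorial's data path reads: ``basic_english`` tokenizer, vocabulary lookup, flattened id tensor plus offsets, ``EmbeddingBag`` plus a linear classifier. As a closing illustration, a self-contained miniature of that inference path on a toy corpus (hypothetical strings and an untrained stand-in model, so the printed label is arbitrary; no AG_NEWS download needed):

    import torch
    from torchtext.data.utils import get_tokenizer
    from torchtext.vocab import build_vocab_from_iterator

    corpus = ["The match went to extra time", "Markets rallied on Tuesday"]
    tokenizer = get_tokenizer("basic_english")
    vocab = build_vocab_from_iterator((tokenizer(t) for t in corpus), specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])  # unseen words map to <unk>
    text_pipeline = lambda x: vocab(tokenizer(x))

    class TinyTextModel(torch.nn.Module):  # hypothetical stand-in for TextClassificationModel
        def __init__(self, vocab_size, embed_dim, num_class):
            super().__init__()
            self.embedding = torch.nn.EmbeddingBag(vocab_size, embed_dim)
            self.fc = torch.nn.Linear(embed_dim, num_class)

        def forward(self, text, offsets):
            return self.fc(self.embedding(text, offsets))

    model = TinyTextModel(len(vocab), 16, 4)
    ag_news_label = {1: "World", 2: "Sports", 3: "Business", 4: "Sci/Tec"}

    def predict(text, text_pipeline):
        with torch.no_grad():
            ids = torch.tensor(text_pipeline(text))
            return model(ids, torch.tensor([0])).argmax(1).item() + 1  # labels are 1-based

    print(ag_news_label[predict("Rain delayed the final again", text_pipeline)])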