set eval mode before running evaluation #2484

Merged
merged 2 commits on Jun 27, 2023

44 changes: 23 additions & 21 deletions intermediate_source/seq2seq_translation_tutorial.py
@@ -441,20 +441,20 @@ def forward_step(self, input, hidden):
# :alt:
#
#
# Bahdanau attention, also known as additive attention, is a commonly used
# attention mechanism in sequence-to-sequence models, particularly in neural
# machine translation tasks. It was introduced by Bahdanau et al. in their
# paper titled `Neural Machine Translation by Jointly Learning to Align and Translate <https://arxiv.org/pdf/1409.0473.pdf>`__.
# This attention mechanism employs a learned alignment model to compute attention
# scores between the encoder and decoder hidden states. It utilizes a feed-forward
# neural network to calculate alignment scores.
#
# However, there are alternative attention mechanisms available, such as Luong attention,
# which computes attention scores by taking the dot product between the decoder hidden
# state and the encoder hidden states. It does not involve the non-linear transformation
# used in Bahdanau attention.
#
# In this tutorial, we will be using Bahdanau attention. However, it would be a valuable
# exercise to explore modifying the attention mechanism to use Luong attention.
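######################################################################
# Not part of the original file: a minimal sketch of what that exercise
# could look like, assuming the same ``(query, keys)`` interface and
# ``(context, weights)`` return values as the ``BahdanauAttention`` class
# below (``torch``, ``nn`` and ``F`` are imported at the top of the
# tutorial). Luong-style attention scores each encoder state against the
# decoder query with a plain dot product, so no alignment network is
# learned.

class DotProductAttention(nn.Module):
    def forward(self, query, keys):
        # query: (batch, 1, hidden), keys: (batch, seq_len, hidden)
        scores = torch.bmm(query, keys.transpose(1, 2))  # (batch, 1, seq_len)
        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)               # (batch, 1, hidden)
        return context, weights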

class BahdanauAttention(nn.Module):
@@ -467,7 +467,7 @@ def __init__(self, hidden_size):
def forward(self, query, keys):
scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
scores = scores.squeeze(2).unsqueeze(1)

weights = F.softmax(scores, dim=-1)
context = torch.bmm(weights, keys)

@@ -605,9 +605,9 @@ def get_dataloader(batch_size):
# ``teacher_forcing_ratio`` up to use more of it.
#
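# Not part of the original file: a minimal sketch of how a
# ``teacher_forcing_ratio`` is typically applied at each decoding step.
# Every name below (``next_decoder_input``, ``target_tensor``,
# ``decoder_output``, ``t``) is an illustrative assumption rather than
# this tutorial's code, which simply passes the target tensor to the
# decoder during training (see ``train_epoch`` below).

import random

teacher_forcing_ratio = 0.5  # assumed value

def next_decoder_input(target_tensor, decoder_output, t):
    if random.random() < teacher_forcing_ratio:
        return target_tensor[:, t].unsqueeze(1)  # feed the ground-truth token
    _, topi = decoder_output.topk(1)
    return topi.squeeze(-1).detach()             # feed the model's own prediction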

def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
decoder_optimizer, criterion):

total_loss = 0
for data in dataloader:
input_tensor, target_tensor = data
@@ -617,7 +617,7 @@ def train_epoch(dataloader, encoder, decoder, encoder_optimizer,

encoder_outputs, encoder_hidden = encoder(input_tensor)
decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

loss = criterion(
decoder_outputs.view(-1, decoder_outputs.size(-1)),
target_tensor.view(-1)
@@ -628,7 +628,7 @@ def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
decoder_optimizer.step()

total_loss += loss.item()

return total_loss / len(dataloader)


@@ -671,7 +671,7 @@ def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
plot_losses = []
print_loss_total = 0 # Reset every print_every
plot_loss_total = 0 # Reset every plot_every

encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()
@@ -680,7 +680,7 @@ def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
print_loss_total += loss
plot_loss_total += loss

if epoch % print_every == 0:
print_loss_avg = print_loss_total / print_every
print_loss_total = 0
@@ -691,7 +691,7 @@ def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
plot_loss_avg = plot_loss_total / plot_every
plot_losses.append(plot_loss_avg)
plot_loss_total = 0

showPlot(plot_losses)

######################################################################
@@ -736,7 +736,7 @@ def evaluate(encoder, decoder, sentence, input_lang, output_lang):

_, topi = decoder_outputs.topk(1)
decoded_ids = topi.squeeze()

decoded_words = []
for idx in decoded_ids:
if idx.item() == EOS_token:
@@ -793,7 +793,9 @@ def evaluateRandomly(encoder, decoder, n=10):

######################################################################
#

# Set dropout layers to ``eval`` mode
encoder.eval()
decoder.eval()
evaluateRandomly(encoder, decoder)
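######################################################################
# Not part of the original file: a tiny illustration of why the ``eval()``
# calls above matter. Dropout is stochastic in training mode and acts as
# the identity in eval mode, so evaluating without ``eval()`` would give
# noisy, non-deterministic translations. (``torch`` and ``nn`` are already
# imported at the top of the tutorial; the variable names below are
# illustrative.)

demo_dropout = nn.Dropout(p=0.5)
demo_input = torch.ones(1, 6)

demo_dropout.train()
print(demo_dropout(demo_input))  # about half the entries zeroed, the rest scaled to 2.0

demo_dropout.eval()
print(demo_dropout(demo_input))  # identical to the input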


@@ -807,7 +809,7 @@ def evaluateRandomly(encoder, decoder, n=10):
# at each time step.
#
# You could simply run ``plt.matshow(attentions)`` to see attention output
# displayed as a matrix. For a better viewing experience we will do the
# extra work of adding axes and labels:
#
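# Not part of the original file: a minimal sketch of that extra work,
# separate from the tutorial's own plotting helper (which is elided from
# this diff). ``input_words`` and ``output_words`` are assumed to be lists
# of tokens, and ``attentions`` a 2D array of attention weights with shape
# (len(output_words), len(input_words)).

import matplotlib.pyplot as plt

def plot_attention_sketch(input_words, output_words, attentions):
    fig, ax = plt.subplots()
    cax = ax.matshow(attentions, cmap='bone')
    fig.colorbar(cax)
    # one tick per token on each axis
    ax.set_xticks(range(len(input_words)))
    ax.set_xticklabels(input_words, rotation=90)
    ax.set_yticks(range(len(output_words)))
    ax.set_yticklabels(output_words)
    plt.show()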
