Skip to content

Commit 8c817f1

Browse files
author
Guanheng Zhang
committed
Fix ascii decode error
1 parent 44ed7e0 commit 8c817f1

File tree

1 file changed

+7
-6
lines changed

1 file changed

+7
-6
lines changed

beginner_source/torchtext_translation_tutorial.py

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@
4343
from torchtext.data.utils import get_tokenizer
4444
from collections import Counter
4545
from torchtext.vocab import Vocab
46-
from torchtext.utils import download_from_url, extract_archive
46+
from torchtext.utils import download_from_url, extract_archive, unicode_csv_reader
4747
import io
4848

4949
url_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'
@@ -69,13 +69,14 @@ def build_vocab(filepath, tokenizer):
6969
en_vocab = build_vocab(train_filepaths[1], en_tokenizer)
7070

7171
def data_process(filepaths):
72-
raw_de_iter, raw_en_iter = iter(io.open(filepaths[0])), iter(io.open(filepaths[1]))
72+
raw_de_iter = iter(unicode_csv_reader(io.open(filepaths[0])))
73+
raw_en_iter = iter(unicode_csv_reader(io.open(filepaths[1])))
7374
data = []
7475
for (raw_de, raw_en) in zip(raw_de_iter, raw_en_iter):
75-
de_tensor_ = torch.tensor([de_vocab[token] for token in de_tokenizer(raw_de)],
76-
dtype=torch.long)
77-
en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer(raw_en)],
78-
dtype=torch.long)
76+
de_tensor_ = torch.tensor([de_vocab[token] for token in de_tokenizer(" ".join(raw_de))],
77+
dtype=torch.long)
78+
en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer(" ".join(raw_en))],
79+
dtype=torch.long)
7980
data.append((de_tensor_, en_tensor_))
8081
return data
8182

0 commit comments

Comments
 (0)