Skip to content

Commit 0d504ea

Browse files
author
Guanheng Zhang
committed
Fix ascii decode error
1 parent 44ed7e0 commit 0d504ea

File tree

1 file changed

+3
-2
lines changed

1 file changed

+3
-2
lines changed

beginner_source/torchtext_translation_tutorial.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,7 @@
4545
from torchtext.vocab import Vocab
4646
from torchtext.utils import download_from_url, extract_archive
4747
import io
48+
from torchtext.utils import unicode_csv_reader
4849

4950
url_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'
5051
train_urls = ('train.de.gz', 'train.en.gz')
@@ -61,8 +62,8 @@
6162
def build_vocab(filepath, tokenizer):
6263
counter = Counter()
6364
with io.open(filepath, encoding="utf8") as f:
64-
for string_ in f:
65-
counter.update(tokenizer(string_))
65+
for string_ in unicode_csv_reader(f):
66+
counter.update(tokenizer(" ".join(string_)))
6667
return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
6768

6869
de_vocab = build_vocab(train_filepaths[0], de_tokenizer)

0 commit comments

Comments
 (0)