from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
- from torchtext.utils import download_from_url, extract_archive
+ from torchtext.utils import download_from_url, extract_archive, unicode_csv_reader
import io

url_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'
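Note on the import added above: as far as the torchtext utilities go, unicode_csv_reader wraps Python's csv.reader so that each line of the opened file comes back as a list of string fields (split on commas) rather than a raw string, which is why the second hunk rebuilds the sentence with " ".join before tokenizing. A minimal sketch of that behavior, with a made-up file name and sentence purely for illustration:

import io
from torchtext.utils import unicode_csv_reader

# Hypothetical file with one sentence per line (not part of this change).
with io.open('train.de', encoding='utf8') as f:
    for row in unicode_csv_reader(f):
        # row is a list of fields, e.g. ['Zwei junge Maenner stehen im Freien.']
        sentence = " ".join(row)  # rebuild the raw sentence string
        print(sentence)
        break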
@@ -69,13 +69,14 @@ def build_vocab(filepath, tokenizer):
en_vocab = build_vocab(train_filepaths[1], en_tokenizer)

def data_process(filepaths):
-     raw_de_iter, raw_en_iter = iter(io.open(filepaths[0])), iter(io.open(filepaths[1]))
+     raw_de_iter = iter(unicode_csv_reader(io.open(filepaths[0])))
+     raw_en_iter = iter(unicode_csv_reader(io.open(filepaths[1])))
    data = []
    for (raw_de, raw_en) in zip(raw_de_iter, raw_en_iter):
-         de_tensor_ = torch.tensor([de_vocab[token] for token in de_tokenizer(raw_de)],
-                                    dtype=torch.long)
-         en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer(raw_en)],
-                                    dtype=torch.long)
+         de_tensor_ = torch.tensor([de_vocab[token] for token in de_tokenizer(" ".join(raw_de))],
+                                    dtype=torch.long)
+         en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer(" ".join(raw_en))],
+                                    dtype=torch.long)
        data.append((de_tensor_, en_tensor_))
    return data
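For context, a hedged usage sketch of data_process after this change, assuming train_filepaths, the vocabularies, and the tokenizers are defined earlier in the tutorial as the hunks above suggest (presumably via the download_from_url / extract_archive calls):

# Build the tensor pairs for training.
train_data = data_process(train_filepaths)

de_tensor, en_tensor = train_data[0]
# Each element is a pair of 1-D LongTensors holding vocabulary indices
# for one German sentence and its English translation.
print(de_tensor.shape, en_tensor.shape)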