Skip to content

Commit 077c290

Browse files
committed
BUG: made encoding optional on csv read/write, addresses #717
1 parent b50af20 commit 077c290

File tree

4 files changed

+52
-18
lines changed

4 files changed

+52
-18
lines changed

pandas/core/common.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -831,11 +831,11 @@ def console_encode(value):
831831
except (AttributeError, TypeError):
832832
return value.encode('ascii', 'replace')
833833

834-
def csv_encode(value, encoding='UTF-8'):
    """
    Encode a unicode value to a byte string for writing to a CSV file.

    Parameters
    ----------
    value : object
        Value about to be written. Only ``unicode`` instances are encoded.
    encoding : str, default 'UTF-8'
        Codec name passed to ``value.encode``.

    Returns
    -------
    The encoded byte string when running on Python 2 and ``value`` is a
    ``unicode`` instance; otherwise ``value`` itself, unchanged.
    """
    # Check the interpreter flag first so that the ``unicode`` name is never
    # evaluated when py3compat.PY3 is true.
    needs_encoding = not py3compat.PY3 and isinstance(value, unicode)
    if needs_encoding:
        # 'replace' substitutes unencodable characters instead of raising.
        return value.encode(encoding, 'replace')
    return value
839839

840840
class UTF8Recoder:
841841
"""

pandas/core/frame.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -834,7 +834,8 @@ def to_panel(self):
834834
to_wide = deprecate('to_wide', to_panel)
835835

836836
def to_csv(self, path, sep=",", na_rep='', cols=None, header=True,
837-
index=True, index_label=None, mode='w', nanRep=None):
837+
index=True, index_label=None, mode='w', nanRep=None,
838+
encoding=None):
838839
"""
839840
Write DataFrame to a comma-separated values (csv) file
840841
@@ -891,11 +892,23 @@ def to_csv(self, path, sep=",", na_rep='', cols=None, header=True,
891892
# given a string for a DF with Index
892893
index_label = [index_label]
893894

894-
encoded_labels = [csv_encode(val) for val in index_label]
895-
encoded_cols = [csv_encode(val) for val in cols]
895+
if encoding is not None:
896+
encoded_labels = [csv_encode(val, encoding=encoding)
897+
for val in index_label]
898+
encoded_cols = [csv_encode(val, encoding=encoding)
899+
for val in cols]
900+
else:
901+
encoded_labels = list(index_label)
902+
encoded_cols = list(cols)
903+
896904
csvout.writerow(encoded_labels + encoded_cols)
897905
else:
898-
encoded_cols = [csv_encode(val) for val in cols]
906+
if encoding is not None:
907+
encoded_cols = [csv_encode(val, encoding=encoding)
908+
for val in cols]
909+
else:
910+
encoded_cols = list(cols)
911+
899912
csvout.writerow(encoded_cols)
900913

901914
nlevels = getattr(self.index, 'nlevels', 1)
@@ -913,7 +926,12 @@ def to_csv(self, path, sep=",", na_rep='', cols=None, header=True,
913926

914927
row_fields.append(val)
915928

916-
encoded_rows = [csv_encode(val) for val in row_fields]
929+
if encoding is not None:
930+
encoded_rows = [csv_encode(val, encoding=encoding)
931+
for val in row_fields]
932+
else:
933+
encoded_rows = list(row_fields)
934+
917935
csvout.writerow(encoded_rows)
918936

919937
f.close()

pandas/io/parsers.py

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,8 @@
8686
def read_csv(filepath_or_buffer, sep=',', header=0, index_col=None, names=None,
8787
skiprows=None, na_values=None, parse_dates=False,
8888
date_parser=None, nrows=None, iterator=False, chunksize=None,
89-
skip_footer=0, converters=None, verbose=False, delimiter=None):
89+
skip_footer=0, converters=None, verbose=False, delimiter=None,
90+
encoding=None):
9091
if hasattr(filepath_or_buffer, 'read'):
9192
f = filepath_or_buffer
9293
else:
@@ -111,7 +112,8 @@ def read_csv(filepath_or_buffer, sep=',', header=0, index_col=None, names=None,
111112
chunksize=chunksize,
112113
skip_footer=skip_footer,
113114
converters=converters,
114-
verbose=verbose)
115+
verbose=verbose,
116+
encoding=encoding)
115117

116118
if nrows is not None:
117119
return parser.get_chunk(nrows)
@@ -124,14 +126,15 @@ def read_csv(filepath_or_buffer, sep=',', header=0, index_col=None, names=None,
124126
def read_table(filepath_or_buffer, sep='\t', header=0, index_col=None,
               names=None, skiprows=None, na_values=None, parse_dates=False,
               date_parser=None, nrows=None, iterator=False, chunksize=None,
               skip_footer=0, converters=None, verbose=False, delimiter=None,
               encoding=None):
    """
    Read a delimited text file by delegating to ``read_csv``.

    Every argument is forwarded to ``read_csv`` unchanged; the only
    difference in this signature is that ``sep`` defaults to a tab
    character rather than a comma.

    Returns whatever ``read_csv`` returns for the given arguments.
    """
    # Forward the caller's ``encoding``. Passing a literal None here would
    # silently discard the argument, so read_table(..., encoding='latin-1')
    # would behave as if no encoding had been requested.
    return read_csv(filepath_or_buffer, sep=sep, header=header,
                    skiprows=skiprows, index_col=index_col,
                    na_values=na_values, date_parser=date_parser,
                    names=names, parse_dates=parse_dates,
                    nrows=nrows, iterator=iterator, chunksize=chunksize,
                    skip_footer=skip_footer, converters=converters,
                    verbose=verbose, delimiter=delimiter, encoding=encoding)
135138

136139
def read_clipboard(**kwargs): # pragma: no cover
137140
"""
@@ -194,7 +197,8 @@ class TextParser(object):
194197
def __init__(self, f, delimiter=None, names=None, header=0,
195198
index_col=None, na_values=None, parse_dates=False,
196199
date_parser=None, chunksize=None, skiprows=None,
197-
skip_footer=0, converters=None, verbose=False):
200+
skip_footer=0, converters=None, verbose=False,
201+
encoding=None):
198202
"""
199203
Workhorse function for processing nested list into DataFrame
200204
@@ -210,6 +214,8 @@ def __init__(self, f, delimiter=None, names=None, header=0,
210214
self.date_parser = date_parser
211215
self.chunksize = chunksize
212216
self.passed_names = names is not None
217+
self.encoding = encoding
218+
213219

214220
if com.is_integer(skiprows):
215221
skiprows = range(skiprows)
@@ -261,9 +267,20 @@ def _make_reader(self, f):
261267
self.pos += 1
262268
sniffed = csv.Sniffer().sniff(line)
263269
dia.delimiter = sniffed.delimiter
264-
self.buf.extend(list(com.UnicodeReader(StringIO(line),
265-
dialect=dia)))
266-
reader = com.UnicodeReader(f, dialect=dia)
270+
if self.encoding is not None:
271+
self.buf.extend(list(
272+
com.UnicodeReader(StringIO(line),
273+
dialect=dia,
274+
encoding=self.encoding)))
275+
else:
276+
self.buf.extend(list(csv.reader(StringIO(line),
277+
dialect=dia)))
278+
279+
if self.encoding is not None:
280+
reader = com.UnicodeReader(f, dialect=dia,
281+
encoding=self.encoding)
282+
else:
283+
reader = csv.reader(f, dialect=dia)
267284
else:
268285
reader = (re.split(sep, line.strip()) for line in f)
269286

pandas/tests/test_frame.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
assert_frame_equal)
2525

2626
import pandas.util.testing as tm
27-
from pandas.util import py3compat
2827
import pandas._tseries as lib
2928

3029
#-------------------------------------------------------------------------------
@@ -2486,8 +2485,8 @@ def test_to_csv_unicode(self):
24862485
from pandas import read_csv
24872486
path = '__tmp__.csv'
24882487
df = DataFrame({u'c/\u03c3':[1,2,3]})
2489-
df.to_csv(path)
2490-
df2 = read_csv(path, index_col=0)
2488+
df.to_csv(path, encoding='UTF-8')
2489+
df2 = read_csv(path, index_col=0, encoding='UTF-8')
24912490
assert_frame_equal(df, df2)
24922491
os.remove(path)
24932492

0 commit comments

Comments
 (0)