Skip to content

Commit e3df4e2

Browse files
committed
DOC: added info on encoding parameter for csv i/o
1 parent db2114a commit e3df4e2

File tree

6 files changed

+62
-17
lines changed

6 files changed

+62
-17
lines changed

RELEASE.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1188,6 +1188,8 @@ Release notes
11881188
* `timeRule` argument in `shift` has been deprecated in favor of using the
11891189
`offset` argument for everything. So you can still pass a time rule string
11901190
to `offset`
1191+
* Added optional `encoding` argument to `read_csv`, `read_table`, `to_csv`,
1192+
`from_csv` to handle unicode in python 2.x
11911193

11921194
**Bug fixes**
11931195

doc/source/io.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ data into a DataFrame object. They can take a number of arguments:
9393
- ``skip_footer``: number of lines to skip at bottom of file (default 0)
9494
- ``converters``: a dictionary of functions for converting values in certain
9595
columns, where keys are either integers or column labels
96+
- ``encoding``: a string representing the encoding to use if the contents are
97+
non-ascii, for python versions prior to 3
9698

9799
.. ipython:: python
98100
:suppress:
@@ -277,6 +279,8 @@ function takes a number of arguments. Only the first is required.
277279
used. (A sequence should be given if the DataFrame uses MultiIndex).
278280
- ``mode`` : Python write mode, default 'w'
279281
- ``sep`` : Field delimiter for the output file (default ",")
282+
- ``encoding``: a string representing the encoding to use if the contents are
283+
non-ascii, for python versions prior to 3
280284

281285
Writing a formatted string
282286
~~~~~~~~~~~~~~~~~~~~~~~~~~

doc/source/whatsnew/v0.7.0.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,9 @@ New features
9494
- ``value_range`` added as utility function to get min and max of a dataframe
9595
(GH288_)
9696

97+
- Added ``encoding`` argument to ``read_csv``, ``read_table``, ``to_csv`` and
98+
``from_csv`` for non-ascii text (GH717_)
99+
97100
API Changes to integer indexing
98101
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
99102

@@ -299,6 +302,7 @@ similar operation to the above but using a Python function:
299302
.. _GH595: https://github.com/wesm/pandas/issues/595
300303
.. _GH647: https://github.com/wesm/pandas/issues/647
301304
.. _GH699: https://github.com/wesm/pandas/issues/699
305+
.. _GH717: https://github.com/wesm/pandas/issues/717
302306
.. _GH93: https://github.com/wesm/pandas/issues/93
303307
.. _GH93: https://github.com/wesm/pandas/issues/93
304308
.. _PR521: https://github.com/wesm/pandas/pull/521

pandas/core/frame.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -744,7 +744,7 @@ def from_items(cls, items, columns=None, orient='columns'):
744744

745745
@classmethod
746746
def from_csv(cls, path, header=0, sep=',', index_col=0,
747-
parse_dates=True):
747+
parse_dates=True, encoding=None):
748748
"""
749749
Read delimited file into DataFrame
750750
@@ -773,7 +773,8 @@ def from_csv(cls, path, header=0, sep=',', index_col=0,
773773
"""
774774
from pandas.io.parsers import read_table
775775
return read_table(path, header=header, sep=sep,
776-
parse_dates=parse_dates, index_col=index_col)
776+
parse_dates=parse_dates, index_col=index_col,
777+
encoding=encoding)
777778

778779
def to_sparse(self, fill_value=None, kind='block'):
779780
"""
@@ -834,7 +835,7 @@ def to_panel(self):
834835
to_wide = deprecate('to_wide', to_panel)
835836

836837
def to_csv(self, path, sep=",", na_rep='', cols=None, header=True,
837-
index=True, index_label=None, mode='w', nanRep=None,
838+
index=True, index_label=None, mode='w', nanRep=None,
838839
encoding=None):
839840
"""
840841
Write DataFrame to a comma-separated values (csv) file
@@ -858,6 +859,9 @@ def to_csv(self, path, sep=",", na_rep='', cols=None, header=True,
858859
mode : Python write mode, default 'w'
859860
sep : character, default ","
860861
Field delimiter for the output file.
862+
encoding : string, optional
863+
a string representing the encoding to use if the contents are
864+
non-ascii, for python versions prior to 3
861865
"""
862866
f = open(path, mode)
863867
csvout = csv.writer(f, lineterminator='\n', delimiter=sep)

pandas/core/series.py

Lines changed: 41 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1979,7 +1979,8 @@ def hist(self, ax=None, grid=True, **kwds):
19791979
return ax
19801980

19811981
@classmethod
1982-
def from_csv(cls, path, sep=',', parse_dates=True):
1982+
def from_csv(cls, path, sep=',', parse_dates=True, header=None,
1983+
index_col=0, encoding=None):
19831984
"""
19841985
Read delimited file into Series
19851986
@@ -1990,30 +1991,56 @@ def from_csv(cls, path, sep=',', parse_dates=True):
19901991
Field delimiter
19911992
parse_dates : boolean, default True
19921993
Parse dates. Different default from read_table
1994+
header : int, default None
1995+
Row to use as header (skip prior rows)
1996+
index_col : int or sequence, default 0
1997+
Column to use for index. If a sequence is given, a MultiIndex
1998+
is used. Different default from read_table
1999+
encoding : string, optional
2000+
a string representing the encoding to use if the contents are
2001+
non-ascii, for python versions prior to 3
19932002
19942003
Returns
19952004
-------
19962005
y : Series
19972006
"""
19982007
from pandas.core.frame import DataFrame
1999-
df = DataFrame.from_csv(path, header=None, sep=sep, parse_dates=parse_dates)
2000-
return df[df.columns[0]]
2008+
df = DataFrame.from_csv(path, header=header, index_col=index_col,
2009+
sep=sep, parse_dates=parse_dates,
2010+
encoding=encoding)
2011+
return df.ix[:, 0]
20012012

2002-
def to_csv(self, path, index=True):
2013+
def to_csv(self, path, index=True, sep=",", na_rep='', header=False,
2014+
index_label=None, mode='w', nanRep=None, encoding=None):
20032015
"""
2004-
Write the Series to a CSV file
2016+
Write Series to a comma-separated values (csv) file
20052017
20062018
Parameters
20072019
----------
2008-
path : string or None
2009-
Output filepath. If None, write to stdout
2010-
index : bool, optional
2011-
Include the index as row names or not
2012-
"""
2013-
f = open(path, 'w')
2014-
csvout = csv.writer(f, lineterminator='\n')
2015-
csvout.writerows(self.iteritems(index))
2016-
f.close()
2020+
path : string
2021+
File path
2022+
na_rep : string, default ''
2023+
Missing data representation
2024+
header : boolean, default False
2025+
Write out series name
2026+
index : boolean, default True
2027+
Write row names (index)
2028+
index_label : string or sequence, default None
2029+
Column label for index column(s) if desired. If None is given, and
2030+
`header` and `index` are True, then the index names are used. A
2031+
sequence should be given if the Series uses MultiIndex.
2032+
mode : Python write mode, default 'w'
2033+
sep : character, default ","
2034+
Field delimiter for the output file.
2035+
encoding : string, optional
2036+
a string representing the encoding to use if the contents are
2037+
non-ascii, for python versions prior to 3
2038+
"""
2039+
from pandas.core.frame import DataFrame
2040+
df = DataFrame(self)
2041+
df.to_csv(path, index=index, sep=sep, na_rep=na_rep, header=header,
2042+
index_label=index_label,mode=mode, nanRep=nanRep,
2043+
encoding=encoding)
20172044

20182045
def dropna(self):
20192046
"""

pandas/io/parsers.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@
5151
Indicate number of NA values placed in non-numeric columns
5252
delimiter : string, default None
5353
Alternative argument name for sep
54+
encoding : string, default None
55+
Encoding to use when reading/writing non-ASCII text (e.g. 'utf-8')
5456
5557
Returns
5658
-------
@@ -185,6 +187,8 @@ class TextParser(object):
185187
Row numbers to skip
186188
skip_footer : int
187189
Number of lines at bottom of file to skip
190+
encoding : string, default None
191+
Encoding to use when reading/writing non-ASCII text (e.g. 'utf-8')
188192
"""
189193

190194
# common NA values

0 commit comments

Comments
 (0)