DOC: added info on encoding parameter for csv i/o

adamklein · adamklein · commit e3df4e2ea341 · 2012-01-31T11:25:58.000-05:00
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -1188,6 +1188,8 @@ Release notes
   * `timeRule` argument in `shift` has been deprecated in favor of using the
     `offset` argument for everything. So you can still pass a time rule string
     to `offset`
+  * Added optional `encoding` argument to `read_csv`, `read_table`, `to_csv`,
+    `from_csv` to handle unicode in python 2.x
 
 **Bug fixes**
 
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -93,6 +93,8 @@ data into a DataFrame object. They can take a number of arguments:
   - ``skip_footer``: number of lines to skip at bottom of file (default 0)
   - ``converters``: a dictionary of functions for converting values in certain
     columns, where keys are either integers or column labels
+  - ``encoding``: a string representing the encoding to use if the contents are
+    non-ascii, for Python versions prior to 3
 
 .. ipython:: python
    :suppress:
@@ -277,6 +279,8 @@ function takes a number of arguments. Only the first is required.
     used. (A sequence should be given if the DataFrame uses MultiIndex).
   - ``mode`` : Python write mode, default 'w'
   - ``sep`` : Field delimiter for the output file (default "'")
+  - ``encoding``: a string representing the encoding to use if the contents are
+    non-ascii, for Python versions prior to 3
 
 Writing a formatted string
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/whatsnew/v0.7.0.txt b/doc/source/whatsnew/v0.7.0.txt
@@ -94,6 +94,9 @@ New features
 - ``value_range`` added as utility function to get min and max of a dataframe
   (GH288_)
 
+- Added ``encoding`` argument to ``read_csv``, ``read_table``, ``to_csv`` and
+  ``from_csv`` for non-ascii text (GH717_)
+
 API Changes to integer indexing
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -299,6 +302,7 @@ similar operation to the above but using a Python function:
 .. _GH595: https://github.com/wesm/pandas/issues/595
 .. _GH647: https://github.com/wesm/pandas/issues/647
 .. _GH699: https://github.com/wesm/pandas/issues/699
+.. _GH717: https://github.com/wesm/pandas/issues/717
 .. _GH93: https://github.com/wesm/pandas/issues/93
 .. _GH93: https://github.com/wesm/pandas/issues/93
 .. _PR521: https://github.com/wesm/pandas/pull/521
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -744,7 +744,7 @@ def from_items(cls, items, columns=None, orient='columns'):
 
     @classmethod
     def from_csv(cls, path, header=0, sep=',', index_col=0,
-                 parse_dates=True):
+                 parse_dates=True, encoding=None):
         """
         Read delimited file into DataFrame
 
@@ -773,7 +773,8 @@ def from_csv(cls, path, header=0, sep=',', index_col=0,
         """
         from pandas.io.parsers import read_table
         return read_table(path, header=header, sep=sep,
-                          parse_dates=parse_dates, index_col=index_col)
+                          parse_dates=parse_dates, index_col=index_col,
+                          encoding=encoding)
 
     def to_sparse(self, fill_value=None, kind='block'):
         """
@@ -834,7 +835,7 @@ def to_panel(self):
     to_wide = deprecate('to_wide', to_panel)
 
     def to_csv(self, path, sep=",", na_rep='', cols=None, header=True,
-              index=True, index_label=None, mode='w', nanRep=None, 
+              index=True, index_label=None, mode='w', nanRep=None,
               encoding=None):
         """
         Write DataFrame to a comma-separated values (csv) file
@@ -858,6 +859,9 @@ def to_csv(self, path, sep=",", na_rep='', cols=None, header=True,
         mode : Python write mode, default 'w'
         sep : character, default ","
             Field delimiter for the output file.
+        encoding : string, optional
+            a string representing the encoding to use if the contents are
+            non-ascii, for python versions prior to 3
         """
         f = open(path, mode)
         csvout = csv.writer(f, lineterminator='\n', delimiter=sep)
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1979,7 +1979,8 @@ def hist(self, ax=None, grid=True, **kwds):
         return ax
 
     @classmethod
-    def from_csv(cls, path, sep=',', parse_dates=True):
+    def from_csv(cls, path, sep=',', parse_dates=True, header=None,
+                 index_col=0, encoding=None):
         """
         Read delimited file into Series
 
@@ -1990,30 +1991,56 @@ def from_csv(cls, path, sep=',', parse_dates=True):
             Field delimiter
         parse_dates : boolean, default True
             Parse dates. Different default from read_table
+        header : int, default None
+            Row to use as header (skip prior rows). None means no header row
+        index_col : int or sequence, default 0
+            Column to use for index. If a sequence is given, a MultiIndex
+            is used. Different default from read_table
+        encoding : string, optional
+            a string representing the encoding to use if the contents are
+            non-ascii, for python versions prior to 3
 
         Returns
         -------
         y : Series
         """
         from pandas.core.frame import DataFrame
-        df = DataFrame.from_csv(path, header=None, sep=sep, parse_dates=parse_dates)
-        return df[df.columns[0]]
+        df = DataFrame.from_csv(path, header=header, index_col=index_col,
+                                sep=sep, parse_dates=parse_dates,
+                                encoding=encoding)
+        return df.ix[:, 0]
 
-    def to_csv(self, path, index=True):
+    def to_csv(self, path, index=True, sep=",", na_rep='', header=False,
+               index_label=None, mode='w', nanRep=None, encoding=None):
         """
-        Write the Series to a CSV file
+        Write Series to a comma-separated values (csv) file
 
         Parameters
         ----------
-        path : string or None
-            Output filepath. If None, write to stdout
-        index : bool, optional
-            Include the index as row names or not
-        """
-        f = open(path, 'w')
-        csvout = csv.writer(f, lineterminator='\n')
-        csvout.writerows(self.iteritems(index))
-        f.close()
+        path : string
+            File path
+        na_rep : string, default ''
+            Missing data representation
+        header : boolean, default False
+            Write out series name
+        index : boolean, default True
+            Write row names (index)
+        index_label : string or sequence, default None
+            Column label for index column(s) if desired. If None is given, and
+            `header` and `index` are True, then the index names are used. A
+            sequence should be given if the Series uses MultiIndex.
+        mode : Python write mode, default 'w'
+        sep : character, default ","
+            Field delimiter for the output file.
+        encoding : string, optional
+            a string representing the encoding to use if the contents are
+            non-ascii, for python versions prior to 3
+        """
+        from pandas.core.frame import DataFrame
+        df = DataFrame(self)
+        df.to_csv(path, index=index, sep=sep, na_rep=na_rep, header=header,
+                  index_label=index_label,mode=mode, nanRep=nanRep,
+                  encoding=encoding)
 
     def dropna(self):
         """
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -51,6 +51,8 @@
     Indicate number of NA values placed in non-numeric columns
 delimiter : string, default None
     Alternative argument name for sep
+encoding : string, default None
+    Encoding to use for non-ASCII text when reading/writing (e.g. 'utf-8')
 
 Returns
 -------
@@ -185,6 +187,8 @@ class TextParser(object):
         Row numbers to skip
     skip_footer : int
         Number of line at bottom of file to skip
+    encoding : string, default None
+        Encoding to use for non-ASCII text when reading/writing (e.g. 'utf-8')
     """
 
     # common NA values