pandas-dev · cpcloud · Jun 4, 2014 · Jun 3, 2014 · jreback · Jun 3, 2014
diff --git a/doc/source/v0.14.1.txt b/doc/source/v0.14.1.txt
@@ -58,6 +58,9 @@ Known Issues
 Enhancements
 ~~~~~~~~~~~~
 - Tests for basic reading of public S3 buckets now exist (:issue:`7281`).
+- ``read_html`` now accepts an ``encoding`` argument that is passed to the
+  underlying parser library. You can use this to read non-ASCII encoded web
+  pages (:issue:`7323`).
 
 - Support for dateutil timezones, which can now be used in the same way as
   pytz timezones across pandas. (:issue:`4688`)

diff --git a/pandas/io/html.py b/pandas/io/html.py
@@ -98,30 +98,33 @@ def _get_skiprows(skiprows):
                     type(skiprows).__name__)
 
 
-def _read(io):
+def _read(obj):
     """Try to read from a url, file or string.
 
     Parameters
     ----------
-    io : str, unicode, or file-like
+    obj : str, unicode, or file-like
 
     Returns
     -------
     raw_text : str
     """
-    if _is_url(io):
-        with urlopen(io) as url:
-            raw_text = url.read()
-    elif hasattr(io, 'read'):
-        raw_text = io.read()
-    elif os.path.isfile(io):
-        with open(io) as f:
-            raw_text = f.read()
-    elif isinstance(io, string_types):
-        raw_text = io
+    if _is_url(obj):
+        with urlopen(obj) as url:
+            text = url.read()
+    elif hasattr(obj, 'read'):
+        text = obj.read()
+    elif isinstance(obj, string_types):
+        text = obj
+        try:
+            if os.path.isfile(text):
+                with open(text, 'rb') as f:
+                    return f.read()
+        except TypeError:
+            pass
     else:
-        raise TypeError("Cannot read object of type %r" % type(io).__name__)
-    return raw_text
+        raise TypeError("Cannot read object of type %r" % type(obj).__name__)
+    return text
 
 
 class _HtmlFrameParser(object):
@@ -165,10 +168,11 @@ class _HtmlFrameParser(object):
     See each method's respective documentation for details on their
     functionality.
     """
-    def __init__(self, io, match, attrs):
+    def __init__(self, io, match, attrs, encoding):
         self.io = io
         self.match = match
         self.attrs = attrs
+        self.encoding = encoding
 
     def parse_tables(self):
         tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
@@ -422,7 +426,8 @@ def _setup_build_doc(self):
 
     def _build_doc(self):
         from bs4 import BeautifulSoup
-        return BeautifulSoup(self._setup_build_doc(), features='html5lib')
+        return BeautifulSoup(self._setup_build_doc(), features='html5lib',
+                             from_encoding=self.encoding)
 
 
 def _build_xpath_expr(attrs):
@@ -519,7 +524,7 @@ def _build_doc(self):
         from lxml.html import parse, fromstring, HTMLParser
         from lxml.etree import XMLSyntaxError
 
-        parser = HTMLParser(recover=False)
+        parser = HTMLParser(recover=False, encoding=self.encoding)
 
         try:
             # try to parse the input in the simplest way
@@ -689,15 +694,15 @@ def _validate_flavor(flavor):
 
 
 def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
-           parse_dates, tupleize_cols, thousands, attrs):
+           parse_dates, tupleize_cols, thousands, attrs, encoding):
     flavor = _validate_flavor(flavor)
     compiled_match = re.compile(match)  # you can pass a compiled regex here
 
     # hack around python 3 deleting the exception variable
     retained = None
     for flav in flavor:
         parser = _parser_dispatch(flav)
-        p = parser(io, compiled_match, attrs)
+        p = parser(io, compiled_match, attrs, encoding)
 
         try:
             tables = p.parse_tables()
@@ -715,7 +720,7 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
 
 def read_html(io, match='.+', flavor=None, header=None, index_col=None,
               skiprows=None, infer_types=None, attrs=None, parse_dates=False,
-              tupleize_cols=False, thousands=','):
+              tupleize_cols=False, thousands=',', encoding=None):
     r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
 
     Parameters
@@ -792,6 +797,12 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
     thousands : str, optional
         Separator to use to parse thousands. Defaults to ``','``.
 
+    encoding : str or None, optional
+        The encoding used to decode the web page. Defaults to ``None``.
+        ``None`` preserves the previous encoding behavior, which depends on
+        the underlying parser library (e.g., the parser library will try to
+        use the encoding provided by the document).
+
     Returns
     -------
     dfs : list of DataFrames
@@ -837,4 +848,4 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
         raise ValueError('cannot skip rows starting from the end of the '
                          'data (you passed a negative value)')
     return _parse(flavor, io, match, header, index_col, skiprows, infer_types,
-                  parse_dates, tupleize_cols, thousands, attrs)
+                  parse_dates, tupleize_cols, thousands, attrs, encoding)
diff --git a/pandas/io/tests/data/html_encoding/chinese_utf16.html b/pandas/io/tests/data/html_encoding/chinese_utf16.html
diff --git a/pandas/io/tests/data/html_encoding/chinese_utf32.html b/pandas/io/tests/data/html_encoding/chinese_utf32.html
diff --git a/pandas/io/tests/data/html_encoding/chinese_utf8.html b/pandas/io/tests/data/html_encoding/chinese_utf8.html
@@ -0,0 +1,26 @@
+<table border="1" class="dataframe">
+  <thead>
+    <tr style="text-align: right;">
+      <th></th>
+      <th>0</th>
+      <th>1</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>0</th>
+      <td> 漊煻獌</td>
+      <td> 漊煻獌</td>
+    </tr>
+    <tr>
+      <th>1</th>
+      <td> 袟袘觕</td>
+      <td> 袟袘觕</td>
+    </tr>
+    <tr>
+      <th>2</th>
+      <td> 埱娵徖</td>
+      <td> 埱娵徖</td>
+    </tr>
+  </tbody>
+</table>
diff --git a/pandas/io/tests/data/html_encoding/letz_latin1.html b/pandas/io/tests/data/html_encoding/letz_latin1.html
@@ -0,0 +1,26 @@
+<table border="1" class="dataframe">
+  <thead>
+    <tr style="text-align: right;">
+      <th></th>
+      <th>0</th>
+      <th>1</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>0</th>
+      <td>  G�t</td>
+      <td>  G�t</td>
+    </tr>
+    <tr>
+      <th>1</th>
+      <td>   m�</td>
+      <td>   m�</td>
+    </tr>
+    <tr>
+      <th>2</th>
+      <td> iech</td>
+      <td> iech</td>
+    </tr>
+  </tbody>
+</table>