diff --git a/doc/source/v0.14.1.txt b/doc/source/v0.14.1.txt
index bb4ecddd58f16..20620f15944f0 100644
--- a/doc/source/v0.14.1.txt
+++ b/doc/source/v0.14.1.txt
@@ -58,6 +58,9 @@ Known Issues
 
 Enhancements
 ~~~~~~~~~~~~
 - Tests for basic reading of public S3 buckets now exist (:issue:`7281`).
+- ``read_html`` now supports an ``encoding`` argument that is passed to the
+  underlying parser library. You can use this to read non-ASCII encoded web
+  pages (:issue:`7323`).
 - Support for dateutil timezones, which can now be used in the same way as
   pytz timezones across pandas. (:issue:`4688`)
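As a quick illustration of the new keyword, here is a minimal usage sketch (not part of the patch itself; it reads one of the latin-1 fixture files added below):

```python
import pandas as pd

# Passing the codec explicitly means the parser library no longer has
# to guess it; 'letz_latin1.html' is a test fixture added by this patch.
dfs = pd.read_html('letz_latin1.html', encoding='latin-1', index_col=0)
print(dfs[0])
```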
diff --git a/pandas/io/html.py b/pandas/io/html.py
index 4375d08abc37c..5ea6ca36ac764 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -98,30 +98,33 @@ def _get_skiprows(skiprows):
                     type(skiprows).__name__)
 
 
-def _read(io):
+def _read(obj):
     """Try to read from a url, file or string.
 
     Parameters
     ----------
-    io : str, unicode, or file-like
+    obj : str, unicode, or file-like
 
     Returns
     -------
     raw_text : str
     """
-    if _is_url(io):
-        with urlopen(io) as url:
-            raw_text = url.read()
-    elif hasattr(io, 'read'):
-        raw_text = io.read()
-    elif os.path.isfile(io):
-        with open(io) as f:
-            raw_text = f.read()
-    elif isinstance(io, string_types):
-        raw_text = io
+    if _is_url(obj):
+        with urlopen(obj) as url:
+            text = url.read()
+    elif hasattr(obj, 'read'):
+        text = obj.read()
+    elif isinstance(obj, string_types):
+        text = obj
+        try:
+            if os.path.isfile(text):
+                with open(text, 'rb') as f:
+                    return f.read()
+        except TypeError:
+            pass
     else:
-        raise TypeError("Cannot read object of type %r" % type(io).__name__)
-    return raw_text
+        raise TypeError("Cannot read object of type %r" % type(obj).__name__)
+    return text
 
 
 class _HtmlFrameParser(object):
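The rewritten ``_read`` dispatches on input type in a fixed order and, notably, now opens file paths in binary mode so that the parser library rather than Python performs the decoding. A self-contained sketch of that order (the URL test below is a simplified stand-in for pandas' ``_is_url``, which checks ``urlparse`` schemes):

```python
import os
from io import StringIO

def dispatch(obj):
    # Mirrors _read's branch order: URL -> file-like -> path/string.
    if isinstance(obj, str) and obj.startswith(('http://', 'https://')):
        return 'url'            # urlopen(obj).read() -> raw bytes
    elif hasattr(obj, 'read'):
        return 'file-like'      # obj.read()
    elif isinstance(obj, str):
        # An existing path is read as raw bytes ('rb'); any other
        # string falls through and is treated as literal HTML.
        return 'path' if os.path.isfile(obj) else 'literal html'
    raise TypeError("Cannot read object of type %r" % type(obj).__name__)

for obj in ('http://example.com', StringIO('<table/>'), '<table/>'):
    print(dispatch(obj))  # url, file-like, literal html
```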
@@ -165,10 +168,11 @@ class _HtmlFrameParser(object):
     See each method's respective documentation for details on their
     functionality.
     """
-    def __init__(self, io, match, attrs):
+    def __init__(self, io, match, attrs, encoding):
         self.io = io
         self.match = match
         self.attrs = attrs
+        self.encoding = encoding
 
     def parse_tables(self):
         tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
@@ -422,7 +426,8 @@ def _setup_build_doc(self):
 
     def _build_doc(self):
         from bs4 import BeautifulSoup
-        return BeautifulSoup(self._setup_build_doc(), features='html5lib')
+        return BeautifulSoup(self._setup_build_doc(), features='html5lib',
+                             from_encoding=self.encoding)
 
 
 def _build_xpath_expr(attrs):
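Viewed in isolation, the bs4 change just forwards the user's codec as ``from_encoding``, which overrides BeautifulSoup's own charset detection when it is handed raw bytes. A minimal sketch using one of the fixtures added below:

```python
from bs4 import BeautifulSoup

# Hand bs4 raw bytes plus an explicit codec; the html5lib tree builder
# then works on correctly decoded characters instead of guessing.
with open('chinese_utf8.html', 'rb') as f:
    soup = BeautifulSoup(f.read(), features='html5lib',
                         from_encoding='utf-8')
print(soup.find('td').get_text())
```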
@@ -519,7 +524,7 @@ def _build_doc(self):
         from lxml.html import parse, fromstring, HTMLParser
         from lxml.etree import XMLSyntaxError
 
-        parser = HTMLParser(recover=False)
+        parser = HTMLParser(recover=False, encoding=self.encoding)
 
         try:
             # try to parse the input in the simplest way
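The lxml flavor gets the same behavior through the parser object: ``encoding`` on ``HTMLParser`` tells lxml how to decode the bytes before parsing, playing the role ``from_encoding`` plays for bs4. A sketch under the same assumptions as the bs4 example:

```python
from lxml.html import fromstring, HTMLParser

# The codec is attached to the parser, then applied to the raw bytes.
parser = HTMLParser(recover=False, encoding='utf-8')
with open('chinese_utf8.html', 'rb') as f:
    doc = fromstring(f.read(), parser=parser)
print(doc.findtext('.//td'))
```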
@@ -689,7 +694,7 @@ def _validate_flavor(flavor):
 
 
 def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
-           parse_dates, tupleize_cols, thousands, attrs):
+           parse_dates, tupleize_cols, thousands, attrs, encoding):
     flavor = _validate_flavor(flavor)
     compiled_match = re.compile(match)  # you can pass a compiled regex here
 
@@ -697,7 +702,7 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
     retained = None
     for flav in flavor:
         parser = _parser_dispatch(flav)
-        p = parser(io, compiled_match, attrs)
+        p = parser(io, compiled_match, attrs, encoding)
 
         try:
             tables = p.parse_tables()
@@ -715,7 +720,7 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
 
 def read_html(io, match='.+', flavor=None, header=None, index_col=None,
               skiprows=None, infer_types=None, attrs=None, parse_dates=False,
-              tupleize_cols=False, thousands=','):
+              tupleize_cols=False, thousands=',', encoding=None):
     r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
 
     Parameters
@@ -792,6 +797,12 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
     thousands : str, optional
         Separator to use to parse thousands. Defaults to ``','``.
 
+    encoding : str or None, optional
+        The encoding used to decode the web page. Defaults to ``None``.
+        ``None`` preserves the previous encoding behavior, which depends on
+        the underlying parser library (e.g., the parser library will try to
+        use the encoding provided by the document).
+
     Returns
     -------
     dfs : list of DataFrames
@@ -837,4 +848,4 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
         raise ValueError('cannot skip rows starting from the end of the '
                          'data (you passed a negative value)')
     return _parse(flavor, io, match, header, index_col, skiprows, infer_types,
-                  parse_dates, tupleize_cols, thousands, attrs)
+                  parse_dates, tupleize_cols, thousands, attrs, encoding)
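Taken together, the hunks above thread the keyword through a single call chain, so nothing else in the pipeline needs to change:

```text
read_html(io, ..., encoding=None)
  -> _parse(flavor, io, match, ..., attrs, encoding)
       -> p = parser(io, compiled_match, attrs, encoding)    # per flavor
            -> bs4 flavor:  BeautifulSoup(..., from_encoding=self.encoding)
            -> lxml flavor: HTMLParser(recover=False, encoding=self.encoding)
```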
diff --git a/pandas/io/tests/data/html_encoding/chinese_utf16.html b/pandas/io/tests/data/html_encoding/chinese_utf16.html
new file mode 100644
index 0000000000000..59fffc0d19c57
Binary files /dev/null and b/pandas/io/tests/data/html_encoding/chinese_utf16.html differ
diff --git a/pandas/io/tests/data/html_encoding/chinese_utf32.html b/pandas/io/tests/data/html_encoding/chinese_utf32.html
new file mode 100644
index 0000000000000..365c44bf08ea1
Binary files /dev/null and b/pandas/io/tests/data/html_encoding/chinese_utf32.html differ
diff --git a/pandas/io/tests/data/html_encoding/chinese_utf8.html b/pandas/io/tests/data/html_encoding/chinese_utf8.html
new file mode 100644
index 0000000000000..ad1ca33a78a65
--- /dev/null
+++ b/pandas/io/tests/data/html_encoding/chinese_utf8.html
@@ -0,0 +1,26 @@
+<table border="1" class="dataframe">
+  <thead>
+    <tr style="text-align: right;">
+      <th></th>
+      <th>0</th>
+      <th>1</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>0</th>
+      <td>漊煻獌</td>
+      <td>漊煻獌</td>
+    </tr>
+    <tr>
+      <th>1</th>
+      <td>袟袘觕</td>
+      <td>袟袘觕</td>
+    </tr>
+    <tr>
+      <th>2</th>
+      <td>埱娵徖</td>
+      <td>埱娵徖</td>
+    </tr>
+  </tbody>
+</table>
\ No newline at end of file
diff --git a/pandas/io/tests/data/html_encoding/letz_latin1.html b/pandas/io/tests/data/html_encoding/letz_latin1.html
new file mode 100644
index 0000000000000..7b4b99cb33388
--- /dev/null
+++ b/pandas/io/tests/data/html_encoding/letz_latin1.html
@@ -0,0 +1,26 @@
+<table border="1" class="dataframe">
+  <thead>
+    <tr style="text-align: right;">
+      <th></th>
+      <th>0</th>
+      <th>1</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>0</th>
+      <td>Gt</td>
+      <td>Gt</td>
+    </tr>
+    <tr>
+      <th>1</th>
+      <td>m</td>
+      <td>m</td>
+    </tr>
+    <tr>
+      <th>2</th>
+      <td>iech</td>
+      <td>iech</td>
+    </tr>
+  </tbody>
+</table>
\ No newline at end of file
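Both fixtures are ``DataFrame.to_html()`` output saved under different codecs. A hedged sketch of how files like these could be regenerated (the actual generator script is not part of this patch; the words are copied from ``chinese_utf8.html`` above):

```python
# -*- coding: utf-8 -*-
import codecs

import pandas as pd

# Render one small frame, then write it out once per target codec.
words = [u'漊煻獌', u'袟袘觕', u'埱娵徖']
df = pd.DataFrame({0: words, 1: words})
for enc in ('utf8', 'utf16', 'utf32'):
    with codecs.open('chinese_%s.html' % enc, 'w', encoding=enc) as f:
        f.write(df.to_html())
```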
diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py
index 12da26c0c7e50..a20a8945eeb11 100644
--- a/pandas/io/tests/test_html.py
+++ b/pandas/io/tests/test_html.py
@@ -1,5 +1,6 @@
 from __future__ import print_function
 
+import glob
 import os
 import re
 import warnings
@@ -110,15 +111,14 @@ def test_to_html_compat(self):
df = mkdf(4, 3, data_gen_f=lambda *args: rand(), c_idx_names=False,
r_idx_names=False).applymap('{0:.3f}'.format).astype(float)
out = df.to_html()
- res = self.read_html(out, attrs={'class': 'dataframe'},
- index_col=0)[0]
+ res = self.read_html(out, attrs={'class': 'dataframe'}, index_col=0)[0]
tm.assert_frame_equal(res, df)
@network
def test_banklist_url(self):
url = 'http://www.fdic.gov/bank/individual/failed/banklist.html'
df1 = self.read_html(url, 'First Federal Bank of Florida',
- attrs={"id": 'table'})
+ attrs={"id": 'table'})
df2 = self.read_html(url, 'Metcalf Bank', attrs={'id': 'table'})
assert_framelist_equal(df1, df2)
@@ -135,9 +135,9 @@ def test_spam_url(self):
@slow
def test_banklist(self):
df1 = self.read_html(self.banklist_data, '.*Florida.*',
- attrs={'id': 'table'})
+ attrs={'id': 'table'})
df2 = self.read_html(self.banklist_data, 'Metcalf Bank',
- attrs={'id': 'table'})
+ attrs={'id': 'table'})
assert_framelist_equal(df1, df2)
@@ -183,8 +183,7 @@ def test_skiprows_int(self):
assert_framelist_equal(df1, df2)
def test_skiprows_xrange(self):
- df1 = self.read_html(self.spam_data, '.*Water.*',
- skiprows=range(2))[0]
+ df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=range(2))[0]
df2 = self.read_html(self.spam_data, 'Unit', skiprows=range(2))[0]
tm.assert_frame_equal(df1, df2)
@@ -195,8 +194,7 @@ def test_skiprows_list(self):
assert_framelist_equal(df1, df2)
def test_skiprows_set(self):
- df1 = self.read_html(self.spam_data, '.*Water.*',
- skiprows=set([1, 2]))
+ df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=set([1, 2]))
df2 = self.read_html(self.spam_data, 'Unit', skiprows=set([2, 1]))
assert_framelist_equal(df1, df2)
@@ -208,23 +206,20 @@ def test_skiprows_slice(self):
assert_framelist_equal(df1, df2)
def test_skiprows_slice_short(self):
- df1 = self.read_html(self.spam_data, '.*Water.*',
- skiprows=slice(2))
+ df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=slice(2))
df2 = self.read_html(self.spam_data, 'Unit', skiprows=slice(2))
assert_framelist_equal(df1, df2)
def test_skiprows_slice_long(self):
- df1 = self.read_html(self.spam_data, '.*Water.*',
- skiprows=slice(2, 5))
- df2 = self.read_html(self.spam_data, 'Unit',
- skiprows=slice(4, 1, -1))
+ df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=slice(2, 5))
+ df2 = self.read_html(self.spam_data, 'Unit', skiprows=slice(4, 1, -1))
assert_framelist_equal(df1, df2)
def test_skiprows_ndarray(self):
df1 = self.read_html(self.spam_data, '.*Water.*',
- skiprows=np.arange(2))
+ skiprows=np.arange(2))
df2 = self.read_html(self.spam_data, 'Unit', skiprows=np.arange(2))
assert_framelist_equal(df1, df2)
@@ -242,30 +237,30 @@ def test_index(self):
def test_header_and_index_no_types(self):
with tm.assert_produces_warning(FutureWarning):
df1 = self.read_html(self.spam_data, '.*Water.*', header=1,
- index_col=0, infer_types=False)
+ index_col=0, infer_types=False)
with tm.assert_produces_warning(FutureWarning):
- df2 = self.read_html(self.spam_data, 'Unit', header=1,
- index_col=0, infer_types=False)
+ df2 = self.read_html(self.spam_data, 'Unit', header=1, index_col=0,
+ infer_types=False)
assert_framelist_equal(df1, df2)
def test_header_and_index_with_types(self):
df1 = self.read_html(self.spam_data, '.*Water.*', header=1,
- index_col=0)
+ index_col=0)
df2 = self.read_html(self.spam_data, 'Unit', header=1, index_col=0)
assert_framelist_equal(df1, df2)
def test_infer_types(self):
with tm.assert_produces_warning(FutureWarning):
df1 = self.read_html(self.spam_data, '.*Water.*', index_col=0,
- infer_types=False)
+ infer_types=False)
with tm.assert_produces_warning(FutureWarning):
df2 = self.read_html(self.spam_data, 'Unit', index_col=0,
- infer_types=False)
+ infer_types=False)
assert_framelist_equal(df1, df2)
with tm.assert_produces_warning(FutureWarning):
df2 = self.read_html(self.spam_data, 'Unit', index_col=0,
- infer_types=True)
+ infer_types=True)
with tm.assertRaises(AssertionError):
assert_framelist_equal(df1, df2)
@@ -308,14 +303,16 @@ def test_bad_url_protocol(self):
def test_invalid_url(self):
try:
with tm.assertRaises(URLError):
- self.read_html('http://www.a23950sdfa908sd.com', match='.*Water.*')
+ self.read_html('http://www.a23950sdfa908sd.com',
+ match='.*Water.*')
except ValueError as e:
tm.assert_equal(str(e), 'No tables found')
@slow
def test_file_url(self):
url = self.banklist_data
- dfs = self.read_html(file_path_to_url(url), 'First', attrs={'id': 'table'})
+ dfs = self.read_html(file_path_to_url(url), 'First',
+ attrs={'id': 'table'})
tm.assert_isinstance(dfs, list)
for df in dfs:
tm.assert_isinstance(df, DataFrame)
@@ -367,8 +364,8 @@ def test_multiindex_header_index_skiprows(self):
def test_regex_idempotency(self):
url = self.banklist_data
dfs = self.read_html(file_path_to_url(url),
- match=re.compile(re.compile('Florida')),
- attrs={'id': 'table'})
+ match=re.compile(re.compile('Florida')),
+ attrs={'id': 'table'})
tm.assert_isinstance(dfs, list)
for df in dfs:
tm.assert_isinstance(df, DataFrame)
@@ -381,15 +378,13 @@ def test_negative_skiprows(self):
@network
def test_multiple_matches(self):
url = 'http://code.google.com/p/pythonxy/wiki/StandardPlugins'
- dfs = self.read_html(url, match='Python',
- attrs={'class': 'wikitable'})
+ dfs = self.read_html(url, match='Python', attrs={'class': 'wikitable'})
self.assertTrue(len(dfs) > 1)
@network
def test_pythonxy_plugins_table(self):
url = 'http://code.google.com/p/pythonxy/wiki/StandardPlugins'
- dfs = self.read_html(url, match='Python',
- attrs={'class': 'wikitable'})
+ dfs = self.read_html(url, match='Python', attrs={'class': 'wikitable'})
zz = [df.iloc[0, 0] for df in dfs]
self.assertEqual(sorted(zz), sorted(['Python', 'SciTE']))
@@ -471,7 +466,7 @@ def try_remove_ws(x):
return x
df = self.read_html(self.banklist_data, 'Metcalf',
- attrs={'id': 'table'})[0]
+ attrs={'id': 'table'})[0]
ground_truth = read_csv(os.path.join(DATA_PATH, 'banklist.csv'),
converters={'Updated Date': Timestamp,
'Closing Date': Timestamp})
@@ -505,7 +500,7 @@ def test_gold_canyon(self):
self.assertIn(gc, raw_text)
df = self.read_html(self.banklist_data, 'Gold Canyon',
- attrs={'id': 'table'})[0]
+ attrs={'id': 'table'})[0]
self.assertIn(gc, df.to_string())
def test_different_number_of_rows(self):
@@ -594,6 +589,35 @@ def test_computer_sales_page(self):
             self.read_html(data, infer_types=False, header=[0, 1])
 
 
+def _lang_enc(filename):
+    return os.path.splitext(os.path.basename(filename))[0].split('_')
+
+
+class TestReadHtmlEncoding(tm.TestCase):
+    files = glob.glob(os.path.join(DATA_PATH, 'html_encoding', '*.html'))
+
+    def read_filename(self, f, encoding):
+        return read_html(f, encoding=encoding, index_col=0)
+
+    def read_file_like(self, f, encoding):
+        with open(f, 'rb') as fobj:
+            return read_html(StringIO(fobj.read()), encoding=encoding,
+                             index_col=0)
+
+    def read_string(self, f, encoding):
+        with open(f, 'rb') as fobj:
+            return read_html(fobj.read(), encoding=encoding, index_col=0)
+
+    def test_encode(self):
+        for f in self.files:
+            _, encoding = _lang_enc(f)
+            from_string = self.read_string(f, encoding).pop()
+            from_file_like = self.read_file_like(f, encoding).pop()
+            from_filename = self.read_filename(f, encoding).pop()
+            tm.assert_frame_equal(from_string, from_file_like)
+            tm.assert_frame_equal(from_string, from_filename)
+
+
 class TestReadHtmlLxml(tm.TestCase):
     @classmethod
     def setUpClass(cls):
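The new tests lean on a file-naming convention: ``_lang_enc`` splits a fixture name into its language and codec halves, and ``test_encode`` then asserts that the literal-string, file-like, and file-path input routes all decode to the same frame. For example:

```python
import os

def _lang_enc(filename):
    # 'data/html_encoding/letz_latin1.html' -> ['letz', 'latin1']
    return os.path.splitext(os.path.basename(filename))[0].split('_')

print(_lang_enc('data/html_encoding/letz_latin1.html'))
```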
@@ -644,7 +668,6 @@ def test_parse_dates_combine(self):
         tm.assert_frame_equal(newdf, res[0])
 
     def test_computer_sales_page(self):
-        import pandas as pd
         data = os.path.join(DATA_PATH, 'computer_sales_page.html')
         with tm.assert_produces_warning(FutureWarning):
             self.read_html(data, infer_types=False, header=[0, 1])