diff --git a/doc/source/v0.14.1.txt b/doc/source/v0.14.1.txt
index bb4ecddd58f16..20620f15944f0 100644
--- a/doc/source/v0.14.1.txt
+++ b/doc/source/v0.14.1.txt
@@ -58,6 +58,9 @@ Known Issues
 
 Enhancements
 ~~~~~~~~~~~~
 - Tests for basic reading of public S3 buckets now exist (:issue:`7281`).
+- ``read_html`` now sports an ``encoding`` argument that is passed to the
+  underlying parser library. You can use this to read non-ASCII encoded web
+  pages (:issue:`7323`).
 - Support for dateutil timezones, which can now be used in the same way as
   pytz timezones across pandas. (:issue:`4688`)
diff --git a/pandas/io/html.py b/pandas/io/html.py
index 4375d08abc37c..5ea6ca36ac764 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -98,30 +98,33 @@ def _get_skiprows(skiprows):
                         type(skiprows).__name__)
 
 
-def _read(io):
+def _read(obj):
     """Try to read from a url, file or string.
 
     Parameters
     ----------
-    io : str, unicode, or file-like
+    obj : str, unicode, or file-like
 
     Returns
     -------
     raw_text : str
     """
-    if _is_url(io):
-        with urlopen(io) as url:
-            raw_text = url.read()
-    elif hasattr(io, 'read'):
-        raw_text = io.read()
-    elif os.path.isfile(io):
-        with open(io) as f:
-            raw_text = f.read()
-    elif isinstance(io, string_types):
-        raw_text = io
+    if _is_url(obj):
+        with urlopen(obj) as url:
+            text = url.read()
+    elif hasattr(obj, 'read'):
+        text = obj.read()
+    elif isinstance(obj, string_types):
+        text = obj
+        try:
+            if os.path.isfile(text):
+                with open(text, 'rb') as f:
+                    return f.read()
+        except TypeError:
+            pass
     else:
-        raise TypeError("Cannot read object of type %r" % type(io).__name__)
-    return raw_text
+        raise TypeError("Cannot read object of type %r" % type(obj).__name__)
+    return text
 
 
 class _HtmlFrameParser(object):
@@ -165,10 +168,11 @@ class _HtmlFrameParser(object):
     See each method's respective documentation for details on their
     functionality.
""" - def __init__(self, io, match, attrs): + def __init__(self, io, match, attrs, encoding): self.io = io self.match = match self.attrs = attrs + self.encoding = encoding def parse_tables(self): tables = self._parse_tables(self._build_doc(), self.match, self.attrs) @@ -422,7 +426,8 @@ def _setup_build_doc(self): def _build_doc(self): from bs4 import BeautifulSoup - return BeautifulSoup(self._setup_build_doc(), features='html5lib') + return BeautifulSoup(self._setup_build_doc(), features='html5lib', + from_encoding=self.encoding) def _build_xpath_expr(attrs): @@ -519,7 +524,7 @@ def _build_doc(self): from lxml.html import parse, fromstring, HTMLParser from lxml.etree import XMLSyntaxError - parser = HTMLParser(recover=False) + parser = HTMLParser(recover=False, encoding=self.encoding) try: # try to parse the input in the simplest way @@ -689,7 +694,7 @@ def _validate_flavor(flavor): def _parse(flavor, io, match, header, index_col, skiprows, infer_types, - parse_dates, tupleize_cols, thousands, attrs): + parse_dates, tupleize_cols, thousands, attrs, encoding): flavor = _validate_flavor(flavor) compiled_match = re.compile(match) # you can pass a compiled regex here @@ -697,7 +702,7 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types, retained = None for flav in flavor: parser = _parser_dispatch(flav) - p = parser(io, compiled_match, attrs) + p = parser(io, compiled_match, attrs, encoding) try: tables = p.parse_tables() @@ -715,7 +720,7 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types, def read_html(io, match='.+', flavor=None, header=None, index_col=None, skiprows=None, infer_types=None, attrs=None, parse_dates=False, - tupleize_cols=False, thousands=','): + tupleize_cols=False, thousands=',', encoding=None): r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. Parameters @@ -792,6 +797,12 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, thousands : str, optional Separator to use to parse thousands. Defaults to ``','``. + encoding : str or None, optional + The encoding used to decode the web page. Defaults to ``None``.``None`` + preserves the previous encoding behavior, which depends on the + underlying parser library (e.g., the parser library will try to use + the encoding provided by the document). 
+
     Returns
     -------
     dfs : list of DataFrames
@@ -837,4 +848,4 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
         raise ValueError('cannot skip rows starting from the end of the '
                          'data (you passed a negative value)')
     return _parse(flavor, io, match, header, index_col, skiprows, infer_types,
-                  parse_dates, tupleize_cols, thousands, attrs)
+                  parse_dates, tupleize_cols, thousands, attrs, encoding)
diff --git a/pandas/io/tests/data/html_encoding/chinese_utf16.html b/pandas/io/tests/data/html_encoding/chinese_utf16.html
new file mode 100644
index 0000000000000..59fffc0d19c57
Binary files /dev/null and b/pandas/io/tests/data/html_encoding/chinese_utf16.html differ
diff --git a/pandas/io/tests/data/html_encoding/chinese_utf32.html b/pandas/io/tests/data/html_encoding/chinese_utf32.html
new file mode 100644
index 0000000000000..365c44bf08ea1
Binary files /dev/null and b/pandas/io/tests/data/html_encoding/chinese_utf32.html differ
diff --git a/pandas/io/tests/data/html_encoding/chinese_utf8.html b/pandas/io/tests/data/html_encoding/chinese_utf8.html
new file mode 100644
index 0000000000000..ad1ca33a78a65
--- /dev/null
+++ b/pandas/io/tests/data/html_encoding/chinese_utf8.html
@@ -0,0 +1,26 @@
+<table border="1" class="dataframe">
+  <thead>
+    <tr style="text-align: right;">
+      <th></th>
+      <th>0</th>
+      <th>1</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>0</th>
+      <td>漊煻獌</td>
+      <td>漊煻獌</td>
+    </tr>
+    <tr>
+      <th>1</th>
+      <td>袟袘觕</td>
+      <td>袟袘觕</td>
+    </tr>
+    <tr>
+      <th>2</th>
+      <td>埱娵徖</td>
+      <td>埱娵徖</td>
+    </tr>
+  </tbody>
+</table>
\ No newline at end of file
diff --git a/pandas/io/tests/data/html_encoding/letz_latin1.html b/pandas/io/tests/data/html_encoding/letz_latin1.html
new file mode 100644
index 0000000000000..7b4b99cb33388
--- /dev/null
+++ b/pandas/io/tests/data/html_encoding/letz_latin1.html
@@ -0,0 +1,26 @@
+<table border="1" class="dataframe">
+  <thead>
+    <tr style="text-align: right;">
+      <th></th>
+      <th>0</th>
+      <th>1</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>0</th>
+      <td>Gt</td>
+      <td>Gt</td>
+    </tr>
+    <tr>
+      <th>1</th>
+      <td>m</td>
+      <td>m</td>
+    </tr>
+    <tr>
+      <th>2</th>
+      <td>iech</td>
+      <td>iech</td>
+    </tr>
+  </tbody>
+</table>
\ No newline at end of file
diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py
index 12da26c0c7e50..a20a8945eeb11 100644
--- a/pandas/io/tests/test_html.py
+++ b/pandas/io/tests/test_html.py
@@ -1,5 +1,6 @@
 from __future__ import print_function
 
+import glob
 import os
 import re
 import warnings
@@ -110,15 +111,14 @@ def test_to_html_compat(self):
         df = mkdf(4, 3, data_gen_f=lambda *args: rand(), c_idx_names=False,
                   r_idx_names=False).applymap('{0:.3f}'.format).astype(float)
         out = df.to_html()
-        res = self.read_html(out, attrs={'class': 'dataframe'},
-                             index_col=0)[0]
+        res = self.read_html(out, attrs={'class': 'dataframe'}, index_col=0)[0]
         tm.assert_frame_equal(res, df)
 
     @network
     def test_banklist_url(self):
         url = 'http://www.fdic.gov/bank/individual/failed/banklist.html'
         df1 = self.read_html(url, 'First Federal Bank of Florida',
-                            attrs={"id": 'table'})
+                             attrs={"id": 'table'})
         df2 = self.read_html(url, 'Metcalf Bank', attrs={'id': 'table'})
 
         assert_framelist_equal(df1, df2)
@@ -135,9 +135,9 @@ def test_spam_url(self):
     @slow
     def test_banklist(self):
         df1 = self.read_html(self.banklist_data, '.*Florida.*',
-                            attrs={'id': 'table'})
+                             attrs={'id': 'table'})
         df2 = self.read_html(self.banklist_data, 'Metcalf Bank',
-                            attrs={'id': 'table'})
+                             attrs={'id': 'table'})
 
         assert_framelist_equal(df1, df2)
@@ -183,8 +183,7 @@ def test_skiprows_int(self):
         assert_framelist_equal(df1, df2)
 
     def test_skiprows_xrange(self):
-        df1 = self.read_html(self.spam_data, '.*Water.*',
-                             skiprows=range(2))[0]
+        df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=range(2))[0]
         df2 = self.read_html(self.spam_data, 'Unit', skiprows=range(2))[0]
         tm.assert_frame_equal(df1, df2)
@@ -195,8 +194,7 @@ def test_skiprows_list(self):
         assert_framelist_equal(df1, df2)
 
     def test_skiprows_set(self):
-        df1 = self.read_html(self.spam_data, '.*Water.*',
-                             skiprows=set([1, 2]))
+        df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=set([1, 2]))
         df2 = self.read_html(self.spam_data, 'Unit', skiprows=set([2, 1]))
 
         assert_framelist_equal(df1, df2)
@@ -208,23 +206,20 @@ def test_skiprows_slice(self):
         assert_framelist_equal(df1, df2)
 
     def test_skiprows_slice_short(self):
-        df1 = self.read_html(self.spam_data, '.*Water.*',
-                             skiprows=slice(2))
+        df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=slice(2))
         df2 = self.read_html(self.spam_data, 'Unit', skiprows=slice(2))
 
         assert_framelist_equal(df1, df2)
 
     def test_skiprows_slice_long(self):
-        df1 = self.read_html(self.spam_data, '.*Water.*',
-                             skiprows=slice(2, 5))
-        df2 = self.read_html(self.spam_data, 'Unit',
-                             skiprows=slice(4, 1, -1))
+        df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=slice(2, 5))
+        df2 = self.read_html(self.spam_data, 'Unit', skiprows=slice(4, 1, -1))
 
         assert_framelist_equal(df1, df2)
 
     def test_skiprows_ndarray(self):
         df1 = self.read_html(self.spam_data, '.*Water.*',
-                            skiprows=np.arange(2))
+                             skiprows=np.arange(2))
         df2 = self.read_html(self.spam_data, 'Unit', skiprows=np.arange(2))
 
         assert_framelist_equal(df1, df2)
@@ -242,30 +237,30 @@ def test_index(self):
 
     def test_header_and_index_no_types(self):
         with tm.assert_produces_warning(FutureWarning):
             df1 = self.read_html(self.spam_data, '.*Water.*', header=1,
-                                index_col=0, infer_types=False)
+                                 index_col=0, infer_types=False)
         with tm.assert_produces_warning(FutureWarning):
-            df2 = self.read_html(self.spam_data, 'Unit', header=1,
-                                 index_col=0, infer_types=False)
+            df2 = self.read_html(self.spam_data, 'Unit', header=1, index_col=0,
+                                 infer_types=False)
         assert_framelist_equal(df1, df2)
 
     def test_header_and_index_with_types(self):
         df1 = self.read_html(self.spam_data, '.*Water.*', header=1,
-                            index_col=0)
+                             index_col=0)
         df2 = self.read_html(self.spam_data, 'Unit', header=1, index_col=0)
         assert_framelist_equal(df1, df2)
 
     def test_infer_types(self):
         with tm.assert_produces_warning(FutureWarning):
             df1 = self.read_html(self.spam_data, '.*Water.*', index_col=0,
-                                infer_types=False)
+                                 infer_types=False)
         with tm.assert_produces_warning(FutureWarning):
             df2 = self.read_html(self.spam_data, 'Unit', index_col=0,
-                                infer_types=False)
+                                 infer_types=False)
 
         assert_framelist_equal(df1, df2)
 
         with tm.assert_produces_warning(FutureWarning):
             df2 = self.read_html(self.spam_data, 'Unit', index_col=0,
-                                infer_types=True)
+                                 infer_types=True)
 
         with tm.assertRaises(AssertionError):
             assert_framelist_equal(df1, df2)
@@ -308,14 +303,16 @@ def test_bad_url_protocol(self):
     def test_invalid_url(self):
         try:
             with tm.assertRaises(URLError):
-                self.read_html('http://www.a23950sdfa908sd.com', match='.*Water.*')
+                self.read_html('http://www.a23950sdfa908sd.com',
+                               match='.*Water.*')
         except ValueError as e:
             tm.assert_equal(str(e), 'No tables found')
 
     @slow
     def test_file_url(self):
         url = self.banklist_data
-        dfs = self.read_html(file_path_to_url(url), 'First', attrs={'id': 'table'})
+        dfs = self.read_html(file_path_to_url(url), 'First',
+                             attrs={'id': 'table'})
         tm.assert_isinstance(dfs, list)
         for df in dfs:
             tm.assert_isinstance(df, DataFrame)
@@ -367,8 +364,8 @@ def test_multiindex_header_index_skiprows(self):
     def test_regex_idempotency(self):
         url = self.banklist_data
         dfs = self.read_html(file_path_to_url(url),
-                            match=re.compile(re.compile('Florida')),
-                            attrs={'id': 'table'})
+                             match=re.compile(re.compile('Florida')),
+                             attrs={'id': 'table'})
         tm.assert_isinstance(dfs, list)
         for df in dfs:
             tm.assert_isinstance(df, DataFrame)
@@ -381,15 +378,13 @@ def test_negative_skiprows(self):
     @network
     def test_multiple_matches(self):
         url = 'http://code.google.com/p/pythonxy/wiki/StandardPlugins'
-        dfs = self.read_html(url, match='Python',
-                             attrs={'class': 'wikitable'})
+        dfs = self.read_html(url, match='Python', attrs={'class': 'wikitable'})
        self.assertTrue(len(dfs) > 1)
 
     @network
     def test_pythonxy_plugins_table(self):
         url = 'http://code.google.com/p/pythonxy/wiki/StandardPlugins'
-        dfs = self.read_html(url, match='Python',
-                             attrs={'class': 'wikitable'})
+        dfs = self.read_html(url, match='Python', attrs={'class': 'wikitable'})
         zz = [df.iloc[0, 0] for df in dfs]
         self.assertEqual(sorted(zz), sorted(['Python', 'SciTE']))
@@ -471,7 +466,7 @@ def try_remove_ws(x):
             return x
 
         df = self.read_html(self.banklist_data, 'Metcalf',
-                           attrs={'id': 'table'})[0]
+                            attrs={'id': 'table'})[0]
         ground_truth = read_csv(os.path.join(DATA_PATH, 'banklist.csv'),
                                 converters={'Updated Date': Timestamp,
                                             'Closing Date': Timestamp})
@@ -505,7 +500,7 @@ def test_gold_canyon(self):
             self.assertIn(gc, raw_text)
 
         df = self.read_html(self.banklist_data, 'Gold Canyon',
-                           attrs={'id': 'table'})[0]
+                            attrs={'id': 'table'})[0]
         self.assertIn(gc, df.to_string())
 
     def test_different_number_of_rows(self):
@@ -594,6 +589,35 @@ def test_computer_sales_page(self):
         self.read_html(data, infer_types=False, header=[0, 1])
 
 
+def _lang_enc(filename):
+    return os.path.splitext(os.path.basename(filename))[0].split('_')
+
+
+class TestReadHtmlEncoding(tm.TestCase):
+    files = glob.glob(os.path.join(DATA_PATH, 'html_encoding', '*.html'))
+
+    def read_filename(self, f, encoding):
+        return read_html(f, encoding=encoding, index_col=0)
+
+    def read_file_like(self, f, encoding):
+        with open(f, 'rb') as fobj:
+            return read_html(StringIO(fobj.read()), encoding=encoding,
+                             index_col=0)
+
+    def read_string(self, f, encoding):
+        with open(f, 'rb') as fobj:
+            return read_html(fobj.read(), encoding=encoding, index_col=0)
+
+    def test_encode(self):
+        for f in self.files:
+            _, encoding = _lang_enc(f)
+            from_string = self.read_string(f, encoding).pop()
+            from_file_like = self.read_file_like(f, encoding).pop()
+            from_filename = self.read_filename(f, encoding).pop()
+            tm.assert_frame_equal(from_string, from_file_like)
+            tm.assert_frame_equal(from_string, from_filename)
+
+
 class TestReadHtmlLxml(tm.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -644,7 +668,6 @@ def test_parse_dates_combine(self):
         tm.assert_frame_equal(newdf, res[0])
 
     def test_computer_sales_page(self):
-        import pandas as pd
         data = os.path.join(DATA_PATH, 'computer_sales_page.html')
         with tm.assert_produces_warning(FutureWarning):
             self.read_html(data, infer_types=False, header=[0, 1])
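For reference, encoded fixtures like the two checked in above can be generated straight from a ``DataFrame`` and round-tripped with the new argument. A minimal sketch using only the public API touched by this patch; the words and the file name are illustrative, not the actual fixture contents:

# -*- coding: utf-8 -*-
import pandas as pd

# Illustrative data only: the checked-in fixture words differ.
words = [u'd\xe9j\xe0', u'na\xefve', u'fa\xe7ade']
df = pd.DataFrame({0: words, 1: words})

# Write the rendered table with an explicit, non-default encoding.
with open('demo_latin1.html', 'wb') as f:
    f.write(df.to_html().encode('latin-1'))

# Reading it back requires telling the parser how to decode the bytes.
result, = pd.read_html('demo_latin1.html', encoding='latin-1', index_col=0)
print(result)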