From cd70225efe50d76d4b83a0402c05b36de7ff314a Mon Sep 17 00:00:00 2001 From: Brian Date: Fri, 27 Jan 2017 09:55:45 -0800 Subject: [PATCH 1/5] ENH:read_html() handles tables with multiple header rows #13434 --- pandas/io/html.py | 30 ++++++++++++++++++++---------- pandas/tests/io/test_html.py | 22 ++++++++++++++++++++++ 2 files changed, 42 insertions(+), 10 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 3c38dae91eb89..2d9c5c0f81263 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -355,9 +355,12 @@ def _parse_raw_thead(self, table): thead = self._parse_thead(table) res = [] if thead: - res = lmap(self._text_getter, self._parse_th(thead[0])) - return np.atleast_1d( - np.array(res).squeeze()) if res and len(res) == 1 else res + trs = self._parse_tr(thead[0]) + for tr in trs: + cols = lmap(self._text_getter, self._parse_td(tr)) + if any([col != '' for col in cols]): + res.append(cols) + return res def _parse_raw_tfoot(self, table): tfoot = self._parse_tfoot(table) @@ -591,9 +594,17 @@ def _parse_tfoot(self, table): return table.xpath('.//tfoot') def _parse_raw_thead(self, table): - expr = './/thead//th' - return [_remove_whitespace(x.text_content()) for x in - table.xpath(expr)] + expr = './/thead' + thead = table.xpath(expr) + res = [] + if thead: + trs = self._parse_tr(thead[0]) + for tr in trs: + cols = [_remove_whitespace(x.text_content()) for x in + self._parse_td(tr)] + if any([col != '' for col in cols]): + res.append(cols) + return res def _parse_raw_tfoot(self, table): expr = './/tfoot//th|//tfoot//td' @@ -615,12 +626,11 @@ def _data_to_frame(**kwargs): head, body, foot = kwargs.pop('data') header = kwargs.pop('header') kwargs['skiprows'] = _get_skiprows(kwargs['skiprows']) - if head: - body = [head] + body - + rows = range(len(head)) + body = head + body if header is None: # special case when a table has elements - header = 0 + header = 0 if rows == [0] else rows if foot: body += [foot] diff --git a/pandas/tests/io/test_html.py 
b/pandas/tests/io/test_html.py index 232e68a87f16e..b20806cfbf6b8 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -760,6 +760,17 @@ def test_keep_default_na(self): html_df = read_html(html_data, keep_default_na=True)[0] tm.assert_frame_equal(expected_df, html_df) + def test_multiple_header_rows(self): + expected_df = DataFrame(data=[("Hillary", 68, "D"), + ("Bernie", 74, "D"), + ("Donald", 69, "R")]) + expected_df.columns = [["Unnamed: 0_level_0", "Age", "Party"], + ["Name", "Unnamed: 1_level_1", + "Unnamed: 2_level_1"]] + html = expected_df.to_html(index=False) + html_df = read_html(html, )[0] + tm.assert_frame_equal(expected_df, html_df) + def _lang_enc(filename): return os.path.splitext(os.path.basename(filename))[0].split('_') @@ -869,6 +880,17 @@ def test_computer_sales_page(self): data = os.path.join(DATA_PATH, 'computer_sales_page.html') self.read_html(data, header=[0, 1]) + def test_multiple_header_rows(self): + expected_df = DataFrame(data=[("Hillary", 68, "D"), + ("Bernie", 74, "D"), + ("Donald", 69, "R")]) + expected_df.columns = [["Unnamed: 0_level_0", "Age", "Party"], + ["Name", "Unnamed: 1_level_1", + "Unnamed: 2_level_1"]] + html = expected_df.to_html(index=False) + html_df = read_html(html, )[0] + tm.assert_frame_equal(expected_df, html_df) + def test_invalid_flavor(): url = 'google.com' From 873ea580ee140f639f36b5d82ffd0c3121ee8861 Mon Sep 17 00:00:00 2001 From: Brian Date: Fri, 27 Jan 2017 11:40:54 -0800 Subject: [PATCH 2/5] switched from range to lrange --- pandas/io/html.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 2d9c5c0f81263..0eca411d14847 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -627,7 +627,7 @@ def _data_to_frame(**kwargs): header = kwargs.pop('header') kwargs['skiprows'] = _get_skiprows(kwargs['skiprows']) if head: - rows = range(len(head)) + rows = lrange(len(head)) body = head + body if header is None: # special case 
when a table has elements header = 0 if rows == [0] else rows @@ -637,7 +637,6 @@ def _data_to_frame(**kwargs): # fill out elements of body that are "ragged" _expand_elements(body) - tp = TextParser(body, header=header, **kwargs) df = tp.read() return df From 41fe8cd53fd108d310fd508f3e2f8867601c2443 Mon Sep 17 00:00:00 2001 From: Brian Date: Wed, 1 Feb 2017 14:31:27 -0800 Subject: [PATCH 3/5] review changes --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/tests/io/test_html.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 75a8752c9bfa4..2cc51c4bf8d2a 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -154,6 +154,7 @@ Other enhancements - ``pandas.tools.hashing`` has gained a ``hash_tuples`` routine, and ``hash_pandas_object`` has gained the ability to hash a ``MultiIndex`` (:issue:`15224`) - ``Series/DataFrame.squeeze()`` have gained the ``axis`` parameter. (:issue:`15339`) - ``DataFrame.to_excel()`` has a new ``freeze_panes`` parameter to turn on Freeze Panes when exporting to Excel (:issue:`15160`) +- ``pd.read_html()`` parses multiple header rows, creating a ``MultiIndex`` header (:issue:`13434`). ..
_ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index b20806cfbf6b8..8ad3c51c370ad 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -761,6 +761,7 @@ def test_keep_default_na(self): tm.assert_frame_equal(expected_df, html_df) def test_multiple_header_rows(self): + # Issue #13434 expected_df = DataFrame(data=[("Hillary", 68, "D"), ("Bernie", 74, "D"), ("Donald", 69, "R")]) @@ -881,6 +882,7 @@ def test_computer_sales_page(self): self.read_html(data, header=[0, 1]) def test_multiple_header_rows(self): + # Issue #13434 expected_df = DataFrame(data=[("Hillary", 68, "D"), ("Bernie", 74, "D"), ("Donald", 69, "R")]) From 6ae2860cdee4109be8442675a26d0f1a745e2b29 Mon Sep 17 00:00:00 2001 From: Brian Date: Wed, 1 Feb 2017 16:34:51 -0800 Subject: [PATCH 4/5] updated docstring and io.rst --- doc/source/io.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 2d6ddf98437e5..30220dff96d01 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2111,9 +2111,10 @@ Read a URL and match a table that contains specific text match = 'Metcalf Bank' df_list = pd.read_html(url, match=match) -Specify a header row (by default ``<th>`` elements are used to form the column -index); if specified, the header row is taken from the data minus the parsed -header elements (``<th>`` elements). +Specify a header row (by default ``<th>`` or ``<td>`` elements located within a +``<thead>`` are used to form the column index, if multiple rows are contained within +``<thead>`` then a multiindex is created); if specified, the header row is taken +from the data minus the parsed header elements (``<th>`` elements). ..
code-block:: python From b54aa0cc8d4afaae33ff57907699d819b9866dde Mon Sep 17 00:00:00 2001 From: Brian Date: Wed, 29 Mar 2017 14:04:12 -0700 Subject: [PATCH 5/5] removed duplicate test case --- pandas/tests/io/test_html.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 8ad3c51c370ad..d056ab2e54410 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -881,18 +881,6 @@ def test_computer_sales_page(self): data = os.path.join(DATA_PATH, 'computer_sales_page.html') self.read_html(data, header=[0, 1]) - def test_multiple_header_rows(self): - # Issue #13434 - expected_df = DataFrame(data=[("Hillary", 68, "D"), - ("Bernie", 74, "D"), - ("Donald", 69, "R")]) - expected_df.columns = [["Unnamed: 0_level_0", "Age", "Party"], - ["Name", "Unnamed: 1_level_1", - "Unnamed: 2_level_1"]] - html = expected_df.to_html(index=False) - html_df = read_html(html, )[0] - tm.assert_frame_equal(expected_df, html_df) - def test_invalid_flavor(): url = 'google.com'