From cd70225efe50d76d4b83a0402c05b36de7ff314a Mon Sep 17 00:00:00 2001
From: Brian <sbhuey@gmail.com>
Date: Fri, 27 Jan 2017 09:55:45 -0800
Subject: [PATCH 1/5] ENH: read_html() handles tables with multiple header rows
 #13434

---
 pandas/io/html.py            | 30 ++++++++++++++++++++----------
 pandas/tests/io/test_html.py | 22 ++++++++++++++++++++++
 2 files changed, 42 insertions(+), 10 deletions(-)
diff --git a/pandas/io/html.py b/pandas/io/html.py
index 3c38dae91eb89..2d9c5c0f81263 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -355,9 +355,12 @@ def _parse_raw_thead(self, table):
         thead = self._parse_thead(table)
         res = []
         if thead:
-            res = lmap(self._text_getter, self._parse_th(thead[0]))
-        return np.atleast_1d(
-            np.array(res).squeeze()) if res and len(res) == 1 else res
+            trs = self._parse_tr(thead[0])
+            for tr in trs:
+                cols = lmap(self._text_getter, self._parse_td(tr))
+                if any([col != '' for col in cols]):
+                    res.append(cols)
+        return res
 
     def _parse_raw_tfoot(self, table):
         tfoot = self._parse_tfoot(table)
@@ -591,9 +594,17 @@ def _parse_tfoot(self, table):
         return table.xpath('.//tfoot')
 
     def _parse_raw_thead(self, table):
-        expr = './/thead//th'
-        return [_remove_whitespace(x.text_content()) for x in
-                table.xpath(expr)]
+        expr = './/thead'
+        thead = table.xpath(expr)
+        res = []
+        if thead:
+            trs = self._parse_tr(thead[0])
+            for tr in trs:
+                cols = [_remove_whitespace(x.text_content()) for x in
+                        self._parse_td(tr)]
+                if any([col != '' for col in cols]):
+                    res.append(cols)
+        return res
 
     def _parse_raw_tfoot(self, table):
         expr = './/tfoot//th|//tfoot//td'
@@ -615,12 +626,11 @@ def _data_to_frame(**kwargs):
     head, body, foot = kwargs.pop('data')
     header = kwargs.pop('header')
     kwargs['skiprows'] = _get_skiprows(kwargs['skiprows'])
-
     if head:
-        body = [head] + body
-
+        rows = range(len(head))
+        body = head + body
         if header is None:  # special case when a table has <th> elements
-            header = 0
+            header = 0 if rows == [0] else rows
 
     if foot:
         body += [foot]
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 232e68a87f16e..b20806cfbf6b8 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -760,6 +760,17 @@ def test_keep_default_na(self):
         html_df = read_html(html_data, keep_default_na=True)[0]
         tm.assert_frame_equal(expected_df, html_df)
 
+    def test_multiple_header_rows(self):
+        expected_df = DataFrame(data=[("Hillary", 68, "D"),
+                                      ("Bernie", 74, "D"),
+                                      ("Donald", 69, "R")])
+        expected_df.columns = [["Unnamed: 0_level_0", "Age", "Party"],
+                               ["Name", "Unnamed: 1_level_1",
+                                "Unnamed: 2_level_1"]]
+        html = expected_df.to_html(index=False)
+        html_df = read_html(html, )[0]
+        tm.assert_frame_equal(expected_df, html_df)
+
 
 def _lang_enc(filename):
     return os.path.splitext(os.path.basename(filename))[0].split('_')
@@ -869,6 +880,17 @@ def test_computer_sales_page(self):
         data = os.path.join(DATA_PATH, 'computer_sales_page.html')
         self.read_html(data, header=[0, 1])
 
+    def test_multiple_header_rows(self):
+        expected_df = DataFrame(data=[("Hillary", 68, "D"),
+                                      ("Bernie", 74, "D"),
+                                      ("Donald", 69, "R")])
+        expected_df.columns = [["Unnamed: 0_level_0", "Age", "Party"],
+                               ["Name", "Unnamed: 1_level_1",
+                                "Unnamed: 2_level_1"]]
+        html = expected_df.to_html(index=False)
+        html_df = read_html(html, )[0]
+        tm.assert_frame_equal(expected_df, html_df)
+
 
 def test_invalid_flavor():
     url = 'google.com'

From 873ea580ee140f639f36b5d82ffd0c3121ee8861 Mon Sep 17 00:00:00 2001
From: Brian <sbhuey@gmail.com>
Date: Fri, 27 Jan 2017 11:40:54 -0800
Subject: [PATCH 2/5] switched from range to lrange

---
 pandas/io/html.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index 2d9c5c0f81263..0eca411d14847 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -627,7 +627,7 @@ def _data_to_frame(**kwargs):
     header = kwargs.pop('header')
     kwargs['skiprows'] = _get_skiprows(kwargs['skiprows'])
     if head:
-        rows = range(len(head))
+        rows = lrange(len(head))
         body = head + body
         if header is None:  # special case when a table has <th> elements
             header = 0 if rows == [0] else rows
@@ -637,7 +637,6 @@ def _data_to_frame(**kwargs):
 
     # fill out elements of body that are "ragged"
     _expand_elements(body)
-
     tp = TextParser(body, header=header, **kwargs)
     df = tp.read()
     return df

From 41fe8cd53fd108d310fd508f3e2f8867601c2443 Mon Sep 17 00:00:00 2001
From: Brian <sbhuey@gmail.com>
Date: Wed, 1 Feb 2017 14:31:27 -0800
Subject: [PATCH 3/5] review changes

---
 doc/source/whatsnew/v0.20.0.txt | 1 +
 pandas/tests/io/test_html.py    | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 75a8752c9bfa4..2cc51c4bf8d2a 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -154,6 +154,7 @@ Other enhancements
 - ``pandas.tools.hashing`` has gained a ``hash_tuples`` routine, and ``hash_pandas_object`` has gained the ability to hash a ``MultiIndex`` (:issue:`15224`)
 - ``Series/DataFrame.squeeze()`` have gained the ``axis`` parameter. (:issue:`15339`)
 - ``DataFrame.to_excel()`` has a new ``freeze_panes`` parameter to turn on Freeze Panes when exporting to Excel (:issue:`15160`)
+- ``pd.read_html()`` now parses multiple header rows, creating a ``MultiIndex`` header (:issue:`13434`)
 
 .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations
 
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index b20806cfbf6b8..8ad3c51c370ad 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -761,6 +761,7 @@ def test_keep_default_na(self):
         tm.assert_frame_equal(expected_df, html_df)
 
     def test_multiple_header_rows(self):
+        # Issue #13434
         expected_df = DataFrame(data=[("Hillary", 68, "D"),
                                       ("Bernie", 74, "D"),
                                       ("Donald", 69, "R")])
@@ -881,6 +882,7 @@ def test_computer_sales_page(self):
         self.read_html(data, header=[0, 1])
 
     def test_multiple_header_rows(self):
+        # Issue #13434
         expected_df = DataFrame(data=[("Hillary", 68, "D"),
                                       ("Bernie", 74, "D"),
                                       ("Donald", 69, "R")])

From 6ae2860cdee4109be8442675a26d0f1a745e2b29 Mon Sep 17 00:00:00 2001
From: Brian <sbhuey@gmail.com>
Date: Wed, 1 Feb 2017 16:34:51 -0800
Subject: [PATCH 4/5] updated docstring and io.rst

---
 doc/source/io.rst | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 2d6ddf98437e5..30220dff96d01 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -2111,9 +2111,10 @@ Read a URL and match a table that contains specific text
    match = 'Metcalf Bank'
    df_list = pd.read_html(url, match=match)
 
-Specify a header row (by default ``<th>`` elements are used to form the column
-index); if specified, the header row is taken from the data minus the parsed
-header elements (``<th>`` elements).
+Specify a header row (by default ``<th>`` or ``<td>`` elements located within a
+``<thead>`` are used to form the column index; if ``<thead>`` contains multiple
+rows, a ``MultiIndex`` is created); if specified, the header row is taken from
+the data minus the parsed header elements (``<th>`` elements).
 
 .. code-block:: python
 

From b54aa0cc8d4afaae33ff57907699d819b9866dde Mon Sep 17 00:00:00 2001
From: Brian <sbhuey@gmail.com>
Date: Wed, 29 Mar 2017 14:04:12 -0700
Subject: [PATCH 5/5] removed duplicate test case

---
 pandas/tests/io/test_html.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 8ad3c51c370ad..d056ab2e54410 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -881,18 +881,6 @@ def test_computer_sales_page(self):
         data = os.path.join(DATA_PATH, 'computer_sales_page.html')
         self.read_html(data, header=[0, 1])
 
-    def test_multiple_header_rows(self):
-        # Issue #13434
-        expected_df = DataFrame(data=[("Hillary", 68, "D"),
-                                      ("Bernie", 74, "D"),
-                                      ("Donald", 69, "R")])
-        expected_df.columns = [["Unnamed: 0_level_0", "Age", "Party"],
-                               ["Name", "Unnamed: 1_level_1",
-                                "Unnamed: 2_level_1"]]
-        html = expected_df.to_html(index=False)
-        html_df = read_html(html, )[0]
-        tm.assert_frame_equal(expected_df, html_df)
-
 
 def test_invalid_flavor():
     url = 'google.com'