From 7230b698d4b85adb31cd761b81333d86e64250d6 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Fri, 9 Mar 2018 10:30:57 -0800
Subject: [PATCH 01/13] Converted bs4 class to pytest template

---
 pandas/tests/io/test_html.py | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 151a0750b7f6e..669eef287c006 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -6,6 +6,8 @@
 import threading
 import warnings
 
+from functools import partial
+
 
 # imports needed for Python 3.x but will fail under Python 2.x
 try:
@@ -91,24 +93,19 @@ def test_bs4_version_fails():
                          flavor='bs4')
 
 
-class ReadHtmlMixin(object):
-
-    def read_html(self, *args, **kwargs):
-        kwargs.setdefault('flavor', self.flavor)
-        return read_html(*args, **kwargs)
-
-
-class TestReadHtml(ReadHtmlMixin):
-    flavor = 'bs4'
+@pytest.mark.parametrize("flavor", [
+    'bs4'], scope="class")
+class TestReadHtml(object):
     spam_data = os.path.join(DATA_PATH, 'spam.html')
     spam_data_kwargs = {}
     if PY3:
         spam_data_kwargs['encoding'] = 'UTF-8'
     banklist_data = os.path.join(DATA_PATH, 'banklist.html')
 
-    @classmethod
-    def setup_class(cls):
-        _skip_if_none_of(('bs4', 'html5lib'))
+    @pytest.fixture(autouse=True, scope="function")
+    def set_defaults(self, flavor, request):
+        self.read_html = partial(read_html, flavor=flavor)
+        yield
 
     def test_to_html_compat(self):
         df = mkdf(4, 3, data_gen_f=lambda *args: rand(), c_idx_names=False,
@@ -838,7 +835,7 @@ def setup_class(cls):
         _skip_if_no(cls.flavor)
 
 
-class TestReadHtmlLxml(ReadHtmlMixin):
+class TestReadHtmlLxml(object):
     flavor = 'lxml'
 
     @classmethod

From 9cb215efcd489f7385e35d5162002a393cb9b2d7 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Fri, 9 Mar 2018 15:21:30 -0800
Subject: [PATCH 02/13] Moved all tests to shared class

---
 pandas/io/html.py                  |  43 +++--
 pandas/tests/io/data/banklist.html |   1 +
 pandas/tests/io/test_html.py       | 265 ++++++++++-------------------
 3 files changed, 112 insertions(+), 197 deletions(-)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index be4854bc19cc6..0a7cc22d6c3da 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -510,8 +510,7 @@ def _parse_td(self, row):
         return row.xpath('.//td|.//th')
 
     def _parse_tr(self, table):
-        expr = './/tr[normalize-space()]'
-        return table.xpath(expr)
+        return table.xpath('.//tr')
 
     def _parse_tables(self, doc, match, kwargs):
         pattern = match.pattern
@@ -551,18 +550,19 @@ def _build_doc(self):
         """
         from lxml.html import parse, fromstring, HTMLParser
         from lxml.etree import XMLSyntaxError
-
-        parser = HTMLParser(recover=False, encoding=self.encoding)
+        parser = HTMLParser(recover=True, encoding=self.encoding)
 
         try:
+            _io = self.io
+            if _is_url(_io):
+                _io = urlopen(_io)
             # try to parse the input in the simplest way
-            r = parse(self.io, parser=parser)
-
+            r = parse(_io, parser=parser)
             try:
                 r = r.getroot()
             except AttributeError:
                 pass
-        except (UnicodeDecodeError, IOError):
+        except (UnicodeDecodeError, IOError) as e:
             # if the input is a blob of html goop
             if not _is_url(self.io):
                 r = fromstring(self.io, parser=parser)
@@ -572,17 +572,7 @@ def _build_doc(self):
                 except AttributeError:
                     pass
             else:
-                # not a url
-                scheme = parse_url(self.io).scheme
-                if scheme not in _valid_schemes:
-                    # lxml can't parse it
-                    msg = (('{invalid!r} is not a valid url scheme, valid '
-                            'schemes are {valid}')
-                           .format(invalid=scheme, valid=_valid_schemes))
-                    raise ValueError(msg)
-                else:
-                    # something else happened: maybe a faulty connection
-                    raise
+                raise e
         else:
             if not hasattr(r, 'text_content'):
                 raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
@@ -602,12 +592,21 @@ def _parse_raw_thead(self, table):
         thead = table.xpath(expr)
         res = []
         if thead:
-            trs = self._parse_tr(thead[0])
-            for tr in trs:
-                cols = [_remove_whitespace(x.text_content()) for x in
-                        self._parse_td(tr)]
+            # Grab any directly descending table headers first
+            ths = thead[0].xpath('./th')
+            if ths:
+                cols = [_remove_whitespace(x.text_content()) for x in ths]
                 if any(col != '' for col in cols):
                     res.append(cols)
+            else:
+                trs = self._parse_tr(thead[0])
+
+                for tr in trs:
+                    cols = [_remove_whitespace(x.text_content()) for x in
+                            self._parse_td(tr)]
+
+                    if any(col != '' for col in cols):
+                        res.append(cols)
         return res
 
     def _parse_raw_tfoot(self, table):
diff --git a/pandas/tests/io/data/banklist.html b/pandas/tests/io/data/banklist.html
index cbcce5a2d49ff..c6f0e47c2a3ef 100644
--- a/pandas/tests/io/data/banklist.html
+++ b/pandas/tests/io/data/banklist.html
@@ -340,6 +340,7 @@ <h1 class="page_title">Failed Bank List</h1>
 				<td class="closing">April 19, 2013</td>
 				<td class="updated">April 23, 2013</td>
 			</tr>
+			<tr>
 				<td class="institution"><a href="goldcanyon.html">Gold Canyon Bank</a></td>
 				<td class="city">Gold Canyon</td>
 				<td class="state">AZ</td>
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 669eef287c006..71d40ef84a685 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -93,8 +93,21 @@ def test_bs4_version_fails():
                          flavor='bs4')
 
 
+def test_invalid_flavor():
+    url = 'google.com'
+    with pytest.raises(ValueError):
+        read_html(url, 'google', flavor='not a* valid**++ flaver')
+
+def test_same_ordering():
+    _skip_if_none_of(['bs4', 'lxml', 'html5lib'])
+    filename = os.path.join(DATA_PATH, 'valid_markup.html')
+    dfs_lxml = read_html(filename, index_col=0, flavor=['lxml'])
+    dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4'])
+    assert_framelist_equal(dfs_lxml, dfs_bs4)
+
+
 @pytest.mark.parametrize("flavor", [
-    'bs4'], scope="class")
+    'bs4', 'lxml'], scope="class")
 class TestReadHtml(object):
     spam_data = os.path.join(DATA_PATH, 'spam.html')
     spam_data_kwargs = {}
@@ -147,7 +160,6 @@ def test_spam_no_types(self):
         df1 = self.read_html(self.spam_data, '.*Water.*')
         df2 = self.read_html(self.spam_data, 'Unit')
         assert_framelist_equal(df1, df2)
-
         assert df1[0].iloc[0, 0] == 'Proximates'
         assert df1[0].columns[0] == 'Nutrient'
 
@@ -779,69 +791,7 @@ def test_multiple_header_rows(self):
         html_df = read_html(html, )[0]
         tm.assert_frame_equal(expected_df, html_df)
 
-
-def _lang_enc(filename):
-    return os.path.splitext(os.path.basename(filename))[0].split('_')
-
-
-class TestReadHtmlEncoding(object):
-    files = glob.glob(os.path.join(DATA_PATH, 'html_encoding', '*.html'))
-    flavor = 'bs4'
-
-    @classmethod
-    def setup_class(cls):
-        _skip_if_none_of((cls.flavor, 'html5lib'))
-
-    def read_html(self, *args, **kwargs):
-        kwargs['flavor'] = self.flavor
-        return read_html(*args, **kwargs)
-
-    def read_filename(self, f, encoding):
-        return self.read_html(f, encoding=encoding, index_col=0)
-
-    def read_file_like(self, f, encoding):
-        with open(f, 'rb') as fobj:
-            return self.read_html(BytesIO(fobj.read()), encoding=encoding,
-                                  index_col=0)
-
-    def read_string(self, f, encoding):
-        with open(f, 'rb') as fobj:
-            return self.read_html(fobj.read(), encoding=encoding, index_col=0)
-
-    def test_encode(self):
-        assert self.files, 'no files read from the data folder'
-        for f in self.files:
-            _, encoding = _lang_enc(f)
-            try:
-                from_string = self.read_string(f, encoding).pop()
-                from_file_like = self.read_file_like(f, encoding).pop()
-                from_filename = self.read_filename(f, encoding).pop()
-                tm.assert_frame_equal(from_string, from_file_like)
-                tm.assert_frame_equal(from_string, from_filename)
-            except Exception:
-                # seems utf-16/32 fail on windows
-                if is_platform_windows():
-                    if '16' in encoding or '32' in encoding:
-                        continue
-                    raise
-
-
-class TestReadHtmlEncodingLxml(TestReadHtmlEncoding):
-    flavor = 'lxml'
-
-    @classmethod
-    def setup_class(cls):
-        super(TestReadHtmlEncodingLxml, cls).setup_class()
-        _skip_if_no(cls.flavor)
-
-
-class TestReadHtmlLxml(object):
-    flavor = 'lxml'
-
-    @classmethod
-    def setup_class(cls):
-        _skip_if_no('lxml')
-
+    @pytest.mark.xfail
     def test_data_fail(self):
         from lxml.etree import XMLSyntaxError
         spam_data = os.path.join(DATA_PATH, 'spam.html')
@@ -861,7 +811,6 @@ def test_works_on_valid_markup(self):
 
     @pytest.mark.slow
     def test_fallback_success(self):
-        _skip_if_none_of(('bs4', 'html5lib'))
         banklist_data = os.path.join(DATA_PATH, 'banklist.html')
         self.read_html(banklist_data, '.*Water.*', flavor=['lxml', 'html5lib'])
 
@@ -891,136 +840,102 @@ def test_parse_dates_combine(self):
 
     def test_computer_sales_page(self):
         data = os.path.join(DATA_PATH, 'computer_sales_page.html')
-        self.read_html(data, header=[0, 1])
-
-
-def test_invalid_flavor():
-    url = 'google.com'
-    with pytest.raises(ValueError):
-        read_html(url, 'google', flavor='not a* valid**++ flaver')
-
-
-def get_elements_from_file(url, element='table'):
-    _skip_if_none_of(('bs4', 'html5lib'))
-    url = file_path_to_url(url)
-    from bs4 import BeautifulSoup
-    with urlopen(url) as f:
-        soup = BeautifulSoup(f, features='html5lib')
-    return soup.find_all(element)
-
-
-@pytest.mark.slow
-def test_bs4_finds_tables():
-    filepath = os.path.join(DATA_PATH, "spam.html")
-    with warnings.catch_warnings():
-        warnings.filterwarnings('ignore')
-        assert get_elements_from_file(filepath, 'table')
-
-
-def get_lxml_elements(url, element):
-    _skip_if_no('lxml')
-    from lxml.html import parse
-    doc = parse(url)
-    return doc.xpath('.//{0}'.format(element))
-
+        self.read_html(data, header=[1, 2])
 
-@pytest.mark.slow
-def test_lxml_finds_tables():
-    filepath = os.path.join(DATA_PATH, "spam.html")
-    assert get_lxml_elements(filepath, 'table')
+    @pytest.mark.parametrize("f", glob.glob(
+        os.path.join(DATA_PATH, 'html_encoding', '*.html')))
+    def test_encode(self, f):
+        _, encoding = os.path.splitext(os.path.basename(f))[0].split('_')
 
-
-@pytest.mark.slow
-def test_lxml_finds_tbody():
-    filepath = os.path.join(DATA_PATH, "spam.html")
-    assert get_lxml_elements(filepath, 'tbody')
-
-
-def test_same_ordering():
-    _skip_if_none_of(['bs4', 'lxml', 'html5lib'])
-    filename = os.path.join(DATA_PATH, 'valid_markup.html')
-    dfs_lxml = read_html(filename, index_col=0, flavor=['lxml'])
-    dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4'])
-    assert_framelist_equal(dfs_lxml, dfs_bs4)
-
-
-class ErrorThread(threading.Thread):
-    def run(self):
         try:
-            super(ErrorThread, self).run()
-        except Exception as e:
-            self.err = e
-        else:
-            self.err = None
+            with open(f, 'rb') as fobj:
+                from_string = self.read_html(fobj.read(), encoding=encoding,
+                                             index_col=0).pop()
 
+            with open(f, 'rb') as fobj:
+                from_file_like = self.read_html(BytesIO(fobj.read()),
+                                                encoding=encoding,
+                                                index_col=0).pop()
 
-@pytest.mark.slow
-def test_importcheck_thread_safety():
-    # see gh-16928
+            from_filename = self.read_html(f, encoding=encoding,
+                                           index_col=0).pop()
+            tm.assert_frame_equal(from_string, from_file_like)
+            tm.assert_frame_equal(from_string, from_filename)
+        except Exception:
+            # utf-16/32 fail on Windows: skip those, re-raise everything else
+            if is_platform_windows():
+                if '16' in encoding or '32' in encoding:
+                    pytest.skip()
+            raise
 
-    # force import check by reinitalising global vars in html.py
-    pytest.importorskip('lxml')
-    reload(pandas.io.html)
+    def test_parse_failure_unseekable(self):
+        # Issue #17975
 
-    filename = os.path.join(DATA_PATH, 'valid_markup.html')
-    helper_thread1 = ErrorThread(target=read_html, args=(filename,))
-    helper_thread2 = ErrorThread(target=read_html, args=(filename,))
+        if self.read_html.keywords.get('flavor') == 'lxml':
+            pytest.skip("Not applicable for lxml")
 
-    helper_thread1.start()
-    helper_thread2.start()
+        class UnseekableStringIO(StringIO):
+            def seekable(self):
+                return False
 
-    while helper_thread1.is_alive() or helper_thread2.is_alive():
-        pass
-    assert None is helper_thread1.err is helper_thread2.err
+        bad = UnseekableStringIO('''
+            <table><tr><td>spam<foobr />eggs</td></tr></table>''')
 
+        assert self.read_html(bad)
 
-def test_parse_failure_unseekable():
-    # Issue #17975
-    _skip_if_no('lxml')
-    _skip_if_no('bs4')
+        with pytest.raises(ValueError,
+                           match='passed a non-rewindable file object'):
+            self.read_html(bad)
 
-    class UnseekableStringIO(StringIO):
-        def seekable(self):
-            return False
+    def test_parse_failure_rewinds(self):
+        # Issue #17975
 
-    good = UnseekableStringIO('''
-        <table><tr><td>spam<br />eggs</td></tr></table>''')
-    bad = UnseekableStringIO('''
-        <table><tr><td>spam<foobr />eggs</td></tr></table>''')
+        class MockFile(object):
+            def __init__(self, data):
+                self.data = data
+                self.at_end = False
 
-    assert read_html(good)
-    assert read_html(bad, flavor='bs4')
+            def read(self, size=None):
+                data = '' if self.at_end else self.data
+                self.at_end = True
+                return data
 
-    bad.seek(0)
+            def seek(self, offset):
+                self.at_end = False
 
-    with pytest.raises(ValueError,
-                       match='passed a non-rewindable file object'):
-        read_html(bad)
+            def seekable(self):
+                return True
 
+        good = MockFile('<table><tr><td>spam<br />eggs</td></tr></table>')
+        bad = MockFile('<table><tr><td>spam<foobr />eggs</td></tr></table>')
 
-def test_parse_failure_rewinds():
-    # Issue #17975
-    _skip_if_no('lxml')
-    _skip_if_no('bs4')
+        assert self.read_html(good)
+        assert self.read_html(bad)
 
-    class MockFile(object):
-        def __init__(self, data):
-            self.data = data
-            self.at_end = False
 
-        def read(self, size=None):
-            data = '' if self.at_end else self.data
-            self.at_end = True
-            return data
+    @pytest.mark.slow
+    def test_importcheck_thread_safety(self):
+        # see gh-16928
+
+        class ErrorThread(threading.Thread):
+            def run(self):
+                try:
+                    super(ErrorThread, self).run()
+                except Exception as e:
+                    self.err = e
+                else:
+                    self.err = None
 
-        def seek(self, offset):
-            self.at_end = False
+        # force import check by reinitalising global vars in html.py
+        reload(pandas.io.html)
 
-        def seekable(self):
-            return True
+        filename = os.path.join(DATA_PATH, 'valid_markup.html')
+        helper_thread1 = ErrorThread(target=self.read_html, args=(filename,))
+        helper_thread2 = ErrorThread(target=self.read_html, args=(filename,))
 
-    good = MockFile('<table><tr><td>spam<br />eggs</td></tr></table>')
-    bad = MockFile('<table><tr><td>spam<foobr />eggs</td></tr></table>')
+        helper_thread1.start()
+        helper_thread2.start()
 
-    assert read_html(good)
-    assert read_html(bad)
+        helper_thread1.join()
+        helper_thread2.join()
+        assert None is helper_thread1.err is helper_thread2.err

From 8f0ce4d79037545b1b70aa7dec7dc2f61e04972c Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Fri, 9 Mar 2018 20:21:43 -0800
Subject: [PATCH 03/13] Added in appropriate skips; cleaned up funcs

---
 pandas/tests/io/test_html.py | 62 ++++++++++--------------------------
 1 file changed, 17 insertions(+), 45 deletions(-)

diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 71d40ef84a685..3a8a8d4c438b9 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -7,15 +7,7 @@
 import warnings
 
 from functools import partial
-
-
-# imports needed for Python 3.x but will fail under Python 2.x
-try:
-    from importlib import import_module, reload
-except ImportError:
-    import_module = __import__
-
-
+from importlib import reload
 from distutils.version import LooseVersion
 
 import pytest
@@ -33,43 +25,12 @@
 from pandas._libs.parsers import ParserError
 
 import pandas.util.testing as tm
+import pandas.util._test_decorators as td
 from pandas.util.testing import makeCustomDataframe as mkdf, network
 
 
-def _have_module(module_name):
-    try:
-        import_module(module_name)
-        return True
-    except ImportError:
-        return False
-
-
-def _skip_if_no(module_name):
-    if not _have_module(module_name):
-        pytest.skip("{0!r} not found".format(module_name))
-
-
-def _skip_if_none_of(module_names):
-    if isinstance(module_names, string_types):
-        _skip_if_no(module_names)
-        if module_names == 'bs4':
-            import bs4
-            if LooseVersion(bs4.__version__) == LooseVersion('4.2.0'):
-                pytest.skip("Bad version of bs4: 4.2.0")
-    else:
-        not_found = [module_name for module_name in module_names if not
-                     _have_module(module_name)]
-        if set(not_found) & set(module_names):
-            pytest.skip("{0!r} not found".format(not_found))
-        if 'bs4' in module_names:
-            import bs4
-            if LooseVersion(bs4.__version__) == LooseVersion('4.2.0'):
-                pytest.skip("Bad version of bs4: 4.2.0")
-
-
 DATA_PATH = tm.get_data_path()
 
-
 def assert_framelist_equal(list1, list2, *args, **kwargs):
     assert len(list1) == len(list2), ('lists are not of equal size '
                                       'len(list1) == {0}, '
@@ -83,23 +44,31 @@ def assert_framelist_equal(list1, list2, *args, **kwargs):
         tm.assert_frame_equal(frame_i, frame_j, *args, **kwargs)
         assert not frame_i.empty, 'frames are both empty'
 
+def _missing_bs4():
+    bs4 = td.safe_import('bs4')
+    if not bs4 or LooseVersion(bs4.__version__) == LooseVersion('4.2.0'):
+        return True
+
+    return False
 
+@td.skip_if_no('bs4')
 def test_bs4_version_fails():
-    _skip_if_none_of(('bs4', 'html5lib'))
     import bs4
     if LooseVersion(bs4.__version__) == LooseVersion('4.2.0'):
         tm.assert_raises(AssertionError, read_html, os.path.join(DATA_PATH,
                                                                  "spam.html"),
                          flavor='bs4')
-
+    else:
+        pytest.skip("Only applicable for bs4 version 4.2.0")
 
 def test_invalid_flavor():
     url = 'google.com'
     with pytest.raises(ValueError):
         read_html(url, 'google', flavor='not a* valid**++ flaver')
 
+@td.skip_if_no('bs4')
+@td.skip_if_no('lxml')
 def test_same_ordering():
-    _skip_if_none_of(['bs4', 'lxml', 'html5lib'])
     filename = os.path.join(DATA_PATH, 'valid_markup.html')
     dfs_lxml = read_html(filename, index_col=0, flavor=['lxml'])
     dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4'])
@@ -107,7 +76,10 @@ def test_same_ordering():
 
 
 @pytest.mark.parametrize("flavor", [
-    'bs4', 'lxml'], scope="class")
+    pytest.param('bs4', marks=pytest.mark.skipif(
+        _missing_bs4(), reason='No bs4')),
+    pytest.param('lxml', marks=pytest.mark.skipif(
+        not td.safe_import('lxml'), reason='No lxml'))], scope="class")
 class TestReadHtml(object):
     spam_data = os.path.join(DATA_PATH, 'spam.html')
     spam_data_kwargs = {}

From 476c19ac41d39d3049efea9f63e3510734f732a3 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Fri, 9 Mar 2018 20:28:33 -0800
Subject: [PATCH 04/13] Added reload to compat

---
 pandas/compat/__init__.py    | 4 ++++
 pandas/tests/io/test_html.py | 3 +--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
index 78aaf4596c8b7..aefa1ddd6cf0b 100644
--- a/pandas/compat/__init__.py
+++ b/pandas/compat/__init__.py
@@ -131,6 +131,9 @@ def lmap(*args, **kwargs):
     def lfilter(*args, **kwargs):
         return list(filter(*args, **kwargs))
 
+    from importlib import reload
+    reload = reload
+
 else:
     # Python 2
     import re
@@ -184,6 +187,7 @@ def get_range_parameters(data):
     lmap = builtins.map
     lfilter = builtins.filter
 
+    reload = builtins.reload
 
 if PY2:
     def iteritems(obj, **kw):
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 3a8a8d4c438b9..f0fca753c6406 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -7,7 +7,6 @@
 import warnings
 
 from functools import partial
-from importlib import reload
 from distutils.version import LooseVersion
 
 import pytest
@@ -18,7 +17,7 @@
 from pandas import (DataFrame, MultiIndex, read_csv, Timestamp, Index,
                     date_range, Series)
 from pandas.compat import (map, zip, StringIO, string_types, BytesIO,
-                           is_platform_windows, PY3)
+                           is_platform_windows, PY3, reload)
 from pandas.io.common import URLError, urlopen, file_path_to_url
 import pandas.io.html
 from pandas.io.html import read_html

From 478601c16ff553c5d95465c5e9ff7a6de54fb179 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Fri, 9 Mar 2018 20:35:59 -0800
Subject: [PATCH 05/13] LINT fixes

---
 pandas/io/html.py            |  3 +-
 pandas/tests/io/test_html.py | 68 ++++++------------------------------
 2 files changed, 11 insertions(+), 60 deletions(-)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index 6e9e61182ab48..fd795cd6837ae 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -14,8 +14,7 @@
 
 from pandas.core.dtypes.common import is_list_like
 from pandas.errors import EmptyDataError
-from pandas.io.common import (_is_url, urlopen,
-                              parse_url, _validate_header_arg)
+from pandas.io.common import _is_url, urlopen, _validate_header_arg
 from pandas.io.parsers import TextParser
 from pandas.compat import (lrange, lmap, u, string_types, iteritems,
                            raise_with_traceback, binary_type)
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index e9539aa731a3b..27d04ed696885 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -4,7 +4,6 @@
 import os
 import re
 import threading
-import warnings
 
 from functools import partial
 from distutils.version import LooseVersion
@@ -16,9 +15,9 @@
 
 from pandas import (DataFrame, MultiIndex, read_csv, Timestamp, Index,
                     date_range, Series)
-from pandas.compat import (map, zip, StringIO, string_types, BytesIO,
+from pandas.compat import (map, zip, StringIO, BytesIO,
                            is_platform_windows, PY3, reload)
-from pandas.io.common import URLError, urlopen, file_path_to_url
+from pandas.io.common import URLError, file_path_to_url
 import pandas.io.html
 from pandas.io.html import read_html
 from pandas._libs.parsers import ParserError
@@ -30,6 +29,7 @@
 
 DATA_PATH = tm.get_data_path()
 
+
 def assert_framelist_equal(list1, list2, *args, **kwargs):
     assert len(list1) == len(list2), ('lists are not of equal size '
                                       'len(list1) == {0}, '
@@ -43,6 +43,7 @@ def assert_framelist_equal(list1, list2, *args, **kwargs):
         tm.assert_frame_equal(frame_i, frame_j, *args, **kwargs)
         assert not frame_i.empty, 'frames are both empty'
 
+
 def _missing_bs4():
     bs4 = td.safe_import('bs4')
     if not bs4 or LooseVersion(bs4.__version__) == LooseVersion('4.2.0'):
@@ -50,6 +51,7 @@ def _missing_bs4():
 
     return False
 
+
 @td.skip_if_no('bs4')
 def test_bs4_version_fails():
     import bs4
@@ -60,11 +62,13 @@ def test_bs4_version_fails():
     else:
         pytest.skip("Only applicable for bs4 version 4.2.0")
 
+
 def test_invalid_flavor():
     url = 'google.com'
     with pytest.raises(ValueError):
         read_html(url, 'google', flavor='not a* valid**++ flaver')
 
+
 @td.skip_if_no('bs4')
 @td.skip_if_no('lxml')
 def test_same_ordering():
@@ -647,6 +651,9 @@ def test_computer_sales_page(self):
                                     r"multi_index of columns"):
             self.read_html(data, header=[0, 1])
 
+        data = os.path.join(DATA_PATH, 'computer_sales_page.html')
+        assert self.read_html(data, header=[1, 2])
+
     def test_wikipedia_states_table(self):
         data = os.path.join(DATA_PATH, 'wikipedia_states.html')
         assert os.path.isfile(data), '%r is not a file' % data
@@ -654,39 +661,6 @@ def test_wikipedia_states_table(self):
         result = self.read_html(data, 'Arizona', header=1)[0]
         assert result['sq mi'].dtype == np.dtype('float64')
 
-    @pytest.mark.parametrize("displayed_only,exp0,exp1", [
-        (True, DataFrame(["foo"]), None),
-        (False, DataFrame(["foo  bar  baz  qux"]), DataFrame(["foo"]))])
-    def test_displayed_only(self, displayed_only, exp0, exp1):
-        # GH 20027
-        data = StringIO("""<html>
-          <body>
-            <table>
-              <tr>
-                <td>
-                  foo
-                  <span style="display:none;text-align:center">bar</span>
-                  <span style="display:none">baz</span>
-                  <span style="display: none">qux</span>
-                </td>
-              </tr>
-            </table>
-            <table style="display: none">
-              <tr>
-                <td>foo</td>
-              </tr>
-            </table>
-          </body>
-        </html>""")
-
-        dfs = self.read_html(data, displayed_only=displayed_only)
-        tm.assert_frame_equal(dfs[0], exp0)
-
-        if exp1 is not None:
-            tm.assert_frame_equal(dfs[1], exp1)
-        else:
-            assert len(dfs) == 1  # Should not parse hidden table
-
     def test_decimal_rows(self):
 
         # GH 12907
@@ -825,27 +799,6 @@ def test_to_html_timestamp(self):
         result = df.to_html()
         assert '2000-01-01' in result
 
-    def test_parse_dates_list(self):
-        df = DataFrame({'date': date_range('1/1/2001', periods=10)})
-        expected = df.to_html()
-        res = self.read_html(expected, parse_dates=[1], index_col=0)
-        tm.assert_frame_equal(df, res[0])
-        res = self.read_html(expected, parse_dates=['date'], index_col=0)
-        tm.assert_frame_equal(df, res[0])
-
-    def test_parse_dates_combine(self):
-        raw_dates = Series(date_range('1/1/2001', periods=10))
-        df = DataFrame({'date': raw_dates.map(lambda x: str(x.date())),
-                        'time': raw_dates.map(lambda x: str(x.time()))})
-        res = self.read_html(df.to_html(), parse_dates={'datetime': [1, 2]},
-                             index_col=1)
-        newdf = DataFrame({'datetime': raw_dates})
-        tm.assert_frame_equal(newdf, res[0])
-
-    def test_computer_sales_page(self):
-        data = os.path.join(DATA_PATH, 'computer_sales_page.html')
-        self.read_html(data, header=[1, 2])
-
     @pytest.mark.parametrize("displayed_only,exp0,exp1", [
         (True, DataFrame(["foo"]), None),
         (False, DataFrame(["foo  bar  baz  qux"]), DataFrame(["foo"]))])
@@ -949,7 +902,6 @@ def seekable(self):
         assert self.read_html(good)
         assert self.read_html(bad)
 
-
     @pytest.mark.slow
     def test_importcheck_thread_safety(self):
         # see gh-16928

From 23602249b9a0b4f6ec2a9bf8b5b5411ee43719d4 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Sun, 11 Mar 2018 21:01:47 -0700
Subject: [PATCH 06/13] Py27 compat

---
 pandas/io/html.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index fd795cd6837ae..5186b5fce41b1 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -607,11 +607,12 @@ def _build_doc(self):
         parser = HTMLParser(recover=True, encoding=self.encoding)
 
         try:
-            _io = self.io
-            if _is_url(_io):
-                _io = urlopen(_io)
-            # try to parse the input in the simplest way
-            r = parse(_io, parser=parser)
+            if _is_url(self.io):
+                with urlopen(self.io) as f:
+                    r = parse(f, parser=parser)
+            else:
+                # try to parse the input in the simplest way
+                r = parse(self.io, parser=parser)
             try:
                 r = r.getroot()
             except AttributeError:

From a93a5a3b4a85cc2c12297789d8bc1a79575035cc Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Mon, 12 Mar 2018 23:51:57 -0700
Subject: [PATCH 07/13] Increased bs4 min version req

---
 ci/requirements-optional-conda.txt |  2 +-
 ci/requirements-optional-pip.txt   |  2 +-
 pandas/io/html.py                  | 12 ++++--------
 pandas/tests/io/test_html.py       | 12 ++++--------
 4 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/ci/requirements-optional-conda.txt b/ci/requirements-optional-conda.txt
index 6edb8d17337e4..65357ce2018d2 100644
--- a/ci/requirements-optional-conda.txt
+++ b/ci/requirements-optional-conda.txt
@@ -1,4 +1,4 @@
-beautifulsoup4
+beautifulsoup4>=4.2.1
 blosc
 bottleneck
 fastparquet
diff --git a/ci/requirements-optional-pip.txt b/ci/requirements-optional-pip.txt
index 8d4421ba2b681..43c7d47892095 100644
--- a/ci/requirements-optional-pip.txt
+++ b/ci/requirements-optional-pip.txt
@@ -1,6 +1,6 @@
 # This file was autogenerated by scripts/convert_deps.py
 # Do not modify directly
-beautifulsoup4
+beautifulsoup4>=4.2.1
 blosc
 bottleneck
 fastparquet
diff --git a/pandas/io/html.py b/pandas/io/html.py
index 5186b5fce41b1..ba5da1b4e3a76 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -738,14 +738,10 @@ def _parser_dispatch(flavor):
             raise ImportError(
                 "BeautifulSoup4 (bs4) not found, please install it")
         import bs4
-        if LooseVersion(bs4.__version__) == LooseVersion('4.2.0'):
-            raise ValueError("You're using a version"
-                             " of BeautifulSoup4 (4.2.0) that has been"
-                             " known to cause problems on certain"
-                             " operating systems such as Debian. "
-                             "Please install a version of"
-                             " BeautifulSoup4 != 4.2.0, both earlier"
-                             " and later releases will work.")
+        if LooseVersion(bs4.__version__) <= LooseVersion('4.2.0'):
+            raise ValueError("A minimum version of BeautifulSoup 4.2.1 "
+                             "is required")
+
     else:
         if not _HAS_LXML:
             raise ImportError("lxml not found, please install it")
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 27d04ed696885..d40c45c969442 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -53,15 +53,11 @@ def _missing_bs4():
 
 
 @td.skip_if_no('bs4')
-def test_bs4_version_fails():
+def test_bs4_version_fails(monkeypatch):
     import bs4
-    if LooseVersion(bs4.__version__) == LooseVersion('4.2.0'):
-        tm.assert_raises(AssertionError, read_html, os.path.join(DATA_PATH,
-                                                                 "spam.html"),
-                         flavor='bs4')
-    else:
-        pytest.skip("Only applicable for bs4 version 4.2.0")
-
+    monkeypatch.setattr(bs4, '__version__', '4.2')
+    with tm.assert_raises_regex(ValueError, "minimum version"):
+        read_html(os.path.join(DATA_PATH, "spam.html"), flavor='bs4')
 
 def test_invalid_flavor():
     url = 'google.com'

From 29904d1ca5dd270d7e3ff1bb0cea9630fa8a9b88 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Mon, 12 Mar 2018 23:52:34 -0700
Subject: [PATCH 08/13] Removed xfail test for lxml

---
 pandas/tests/io/test_html.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index d40c45c969442..39b1f8bcea649 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -765,18 +765,6 @@ def test_multiple_header_rows(self):
         html_df = read_html(html, )[0]
         tm.assert_frame_equal(expected_df, html_df)
 
-    @pytest.mark.xfail
-    def test_data_fail(self):
-        from lxml.etree import XMLSyntaxError
-        spam_data = os.path.join(DATA_PATH, 'spam.html')
-        banklist_data = os.path.join(DATA_PATH, 'banklist.html')
-
-        with pytest.raises(XMLSyntaxError):
-            self.read_html(spam_data)
-
-        with pytest.raises(XMLSyntaxError):
-            self.read_html(banklist_data)
-
     def test_works_on_valid_markup(self):
         filename = os.path.join(DATA_PATH, 'valid_markup.html')
         dfs = self.read_html(filename, index_col=0)

From f488fc8c07efaf94f35404597702a64d7099093d Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Mon, 12 Mar 2018 23:54:47 -0700
Subject: [PATCH 09/13] LINTing

---
 pandas/tests/io/test_html.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 39b1f8bcea649..0b021cc01beae 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -59,6 +59,7 @@ def test_bs4_version_fails(monkeypatch):
     with tm.assert_raises_regex(ValueError, "minimum version"):
         read_html(os.path.join(DATA_PATH, "spam.html"), flavor='bs4')
 
+
 def test_invalid_flavor():
     url = 'google.com'
     with pytest.raises(ValueError):

From 41b77e1a22ea6eaea349c9218d701e7ba1dca206 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Tue, 13 Mar 2018 10:56:34 -0700
Subject: [PATCH 10/13] Clean up unnecessary test

---
 pandas/tests/io/test_html.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 0b021cc01beae..f18fc2e91e266 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -44,14 +44,6 @@ def assert_framelist_equal(list1, list2, *args, **kwargs):
         assert not frame_i.empty, 'frames are both empty'
 
 
-def _missing_bs4():
-    bs4 = td.safe_import('bs4')
-    if not bs4 or LooseVersion(bs4.__version__) == LooseVersion('4.2.0'):
-        return True
-
-    return False
-
-
 @td.skip_if_no('bs4')
 def test_bs4_version_fails(monkeypatch):
     import bs4
@@ -77,7 +69,7 @@ def test_same_ordering():
 
 @pytest.mark.parametrize("flavor", [
     pytest.param('bs4', marks=pytest.mark.skipif(
-        _missing_bs4(), reason='No bs4')),
+        not td.safe_import('bs4'), reason='No bs4')),
     pytest.param('lxml', marks=pytest.mark.skipif(
         not td.safe_import('lxml'), reason='No lxml'))], scope="class")
 class TestReadHtml(object):

From d44b164c967ec99da8dd9ae5a0d1fedca447c6ca Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Tue, 13 Mar 2018 11:17:15 -0700
Subject: [PATCH 11/13] Updated documentation

---
 doc/source/install.rst          |  9 ++++++---
 doc/source/whatsnew/v0.23.0.txt | 16 +++++++++-------
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/doc/source/install.rst b/doc/source/install.rst
index 07f57dbd65709..7d741c6c2c75a 100644
--- a/doc/source/install.rst
+++ b/doc/source/install.rst
@@ -266,6 +266,12 @@ Optional Dependencies
 * One of the following combinations of libraries is needed to use the
   top-level :func:`~pandas.read_html` function:
 
+  .. versionchanged:: 0.23.0
+
+  .. note::
+
+     If using BeautifulSoup4, a minimum version of 4.2.1 is required.
+
   * `BeautifulSoup4`_ and `html5lib`_ (Any recent version of `html5lib`_ is
     okay.)
   * `BeautifulSoup4`_ and `lxml`_
@@ -282,9 +288,6 @@ Optional Dependencies
      * You are highly encouraged to read :ref:`HTML Table Parsing gotchas <io.html.gotchas>`.
        It explains issues surrounding the installation and
        usage of the above three libraries.
-     * You may need to install an older version of `BeautifulSoup4`_:
-       Versions 4.2.1, 4.1.3 and 4.0.2 have been confirmed for 64 and 32-bit
-       Ubuntu/Debian
 
   .. note::
 
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index f686a042c1a74..c6683edef7272 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -358,13 +358,15 @@ Dependencies have increased minimum versions
 We have updated our minimum supported versions of dependencies (:issue:`15184`).
 If installed, we now require:
 
-+-----------------+-----------------+----------+
-| Package         | Minimum Version | Required |
-+=================+=================+==========+
-| python-dateutil | 2.5.0           |    X     |
-+-----------------+-----------------+----------+
-| openpyxl        | 2.4.0           |          |
-+-----------------+-----------------+----------+
++-----------------+-----------------+----------+---------------+
+| Package         | Minimum Version | Required |     Issue     |
++=================+=================+==========+===============+
+| python-dateutil | 2.5.0           |    X     | :issue:`15184`|
++-----------------+-----------------+----------+---------------+
+| openpyxl        | 2.4.0           |          | :issue:`15184`|
++-----------------+-----------------+----------+---------------+
+| beautifulsoup4  | 4.2.1           |          | :issue:`20082`|
++-----------------+-----------------+----------+---------------+
 
 .. _whatsnew_0230.api_breaking.dict_insertion_order:
 

From e6943b1a48ccd51fae4e60c3f232d1337b0ad8e2 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Tue, 13 Mar 2018 16:37:29 -0700
Subject: [PATCH 12/13] Bumped bs4 build req

---
 ci/requirements-2.7_COMPAT.pip | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/requirements-2.7_COMPAT.pip b/ci/requirements-2.7_COMPAT.pip
index 13cd35a923124..0e154dbc07525 100644
--- a/ci/requirements-2.7_COMPAT.pip
+++ b/ci/requirements-2.7_COMPAT.pip
@@ -1,4 +1,4 @@
 html5lib==1.0b2
-beautifulsoup4==4.2.0
+beautifulsoup4==4.2.1
 openpyxl
 argparse

From 50d072dbdce2bd44d1b83044743c1e511f7c1f25 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Tue, 13 Mar 2018 16:38:32 -0700
Subject: [PATCH 13/13] LINT fix

---
 pandas/tests/io/test_html.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index f18fc2e91e266..79b9a3715efd2 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -6,7 +6,6 @@
 import threading
 
 from functools import partial
-from distutils.version import LooseVersion
 
 import pytest