From 7230b698d4b85adb31cd761b81333d86e64250d6 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 9 Mar 2018 10:30:57 -0800 Subject: [PATCH 01/13] Converted bs4 class to pytest template --- pandas/tests/io/test_html.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 151a0750b7f6e..669eef287c006 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -6,6 +6,8 @@ import threading import warnings +from functools import partial + # imports needed for Python 3.x but will fail under Python 2.x try: @@ -91,24 +93,19 @@ def test_bs4_version_fails(): flavor='bs4') -class ReadHtmlMixin(object): - - def read_html(self, *args, **kwargs): - kwargs.setdefault('flavor', self.flavor) - return read_html(*args, **kwargs) - - -class TestReadHtml(ReadHtmlMixin): - flavor = 'bs4' +@pytest.mark.parametrize("flavor", [ + 'bs4'], scope="class") +class TestReadHtml(object): spam_data = os.path.join(DATA_PATH, 'spam.html') spam_data_kwargs = {} if PY3: spam_data_kwargs['encoding'] = 'UTF-8' banklist_data = os.path.join(DATA_PATH, 'banklist.html') - @classmethod - def setup_class(cls): - _skip_if_none_of(('bs4', 'html5lib')) + @pytest.fixture(autouse=True, scope="function") + def set_defaults(self, flavor, request): + self.read_html = partial(read_html, flavor=flavor) + yield def test_to_html_compat(self): df = mkdf(4, 3, data_gen_f=lambda *args: rand(), c_idx_names=False, @@ -838,7 +835,7 @@ def setup_class(cls): _skip_if_no(cls.flavor) -class TestReadHtmlLxml(ReadHtmlMixin): +class TestReadHtmlLxml(object): flavor = 'lxml' @classmethod From 9cb215efcd489f7385e35d5162002a393cb9b2d7 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 9 Mar 2018 15:21:30 -0800 Subject: [PATCH 02/13] Moved all tests to shared class --- pandas/io/html.py | 43 +++-- pandas/tests/io/data/banklist.html | 1 + pandas/tests/io/test_html.py | 265 ++++++++++------------------- 3 files changed, 112 insertions(+), 197 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index be4854bc19cc6..0a7cc22d6c3da 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -510,8 +510,7 @@ def _parse_td(self, row): return row.xpath('.//td|.//th') def _parse_tr(self, table): - expr = './/tr[normalize-space()]' - return table.xpath(expr) + return table.xpath('.//tr') def _parse_tables(self, doc, match, kwargs): pattern = match.pattern @@ -551,18 +550,19 @@ def _build_doc(self): """ from lxml.html import parse, fromstring, HTMLParser from lxml.etree import XMLSyntaxError - - parser = HTMLParser(recover=False, encoding=self.encoding) + parser = HTMLParser(recover=True, encoding=self.encoding) try: + _io = self.io + if _is_url(_io): + _io = urlopen(_io) # try to parse the input in the simplest way - r = parse(self.io, parser=parser) - + r = parse(_io, parser=parser) try: r = r.getroot() except AttributeError: pass - except (UnicodeDecodeError, IOError): + except (UnicodeDecodeError, IOError) as e: # if the input is a blob of html goop if not _is_url(self.io): r = fromstring(self.io, parser=parser) @@ -572,17 +572,7 @@ def _build_doc(self): except AttributeError: pass else: - # not a url - scheme = parse_url(self.io).scheme - if scheme not in _valid_schemes: - # lxml can't parse it - msg = (('{invalid!r} is not a valid url scheme, valid ' - 'schemes are {valid}') - .format(invalid=scheme, valid=_valid_schemes)) - raise ValueError(msg) - else: - # something else happened: maybe a faulty connection - raise + raise e else: if not hasattr(r, 'text_content'): raise XMLSyntaxError("no text parsed from document", 0, 0, 0) @@ -602,12 +592,21 @@ def _parse_raw_thead(self, table): thead = table.xpath(expr) res = [] if thead: - trs = self._parse_tr(thead[0]) - for tr in trs: - cols = [_remove_whitespace(x.text_content()) for x in - self._parse_td(tr)] + # Grab any directly descending table headers first + ths = thead[0].xpath('./th') + if ths: + cols = [_remove_whitespace(x.text_content()) for x in ths] if any(col != '' for col in cols): res.append(cols) + else: + trs = self._parse_tr(thead[0]) + + for tr in trs: + cols = [_remove_whitespace(x.text_content()) for x in + self._parse_td(tr)] + + if any(col != '' for col in cols): + res.append(cols) return res def _parse_raw_tfoot(self, table): diff --git a/pandas/tests/io/data/banklist.html b/pandas/tests/io/data/banklist.html index cbcce5a2d49ff..c6f0e47c2a3ef 100644 --- a/pandas/tests/io/data/banklist.html +++ b/pandas/tests/io/data/banklist.html @@ -340,6 +340,7 @@

Failed Bank List

April 19, 2013 April 23, 2013 + Gold Canyon Bank Gold Canyon AZ diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 669eef287c006..71d40ef84a685 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -93,8 +93,21 @@ def test_bs4_version_fails(): flavor='bs4') +def test_invalid_flavor(): + url = 'google.com' + with pytest.raises(ValueError): + read_html(url, 'google', flavor='not a* valid**++ flaver') + +def test_same_ordering(): + _skip_if_none_of(['bs4', 'lxml', 'html5lib']) + filename = os.path.join(DATA_PATH, 'valid_markup.html') + dfs_lxml = read_html(filename, index_col=0, flavor=['lxml']) + dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4']) + assert_framelist_equal(dfs_lxml, dfs_bs4) + + @pytest.mark.parametrize("flavor", [ - 'bs4'], scope="class") + 'bs4', 'lxml'], scope="class") class TestReadHtml(object): spam_data = os.path.join(DATA_PATH, 'spam.html') spam_data_kwargs = {} @@ -147,7 +160,6 @@ def test_spam_no_types(self): df1 = self.read_html(self.spam_data, '.*Water.*') df2 = self.read_html(self.spam_data, 'Unit') assert_framelist_equal(df1, df2) - assert df1[0].iloc[0, 0] == 'Proximates' assert df1[0].columns[0] == 'Nutrient' @@ -779,69 +791,7 @@ def test_multiple_header_rows(self): html_df = read_html(html, )[0] tm.assert_frame_equal(expected_df, html_df) - -def _lang_enc(filename): - return os.path.splitext(os.path.basename(filename))[0].split('_') - - -class TestReadHtmlEncoding(object): - files = glob.glob(os.path.join(DATA_PATH, 'html_encoding', '*.html')) - flavor = 'bs4' - - @classmethod - def setup_class(cls): - _skip_if_none_of((cls.flavor, 'html5lib')) - - def read_html(self, *args, **kwargs): - kwargs['flavor'] = self.flavor - return read_html(*args, **kwargs) - - def read_filename(self, f, encoding): - return self.read_html(f, encoding=encoding, index_col=0) - - def read_file_like(self, f, encoding): - with open(f, 'rb') as fobj: - return self.read_html(BytesIO(fobj.read()), encoding=encoding, - index_col=0) - - def read_string(self, f, encoding): - with open(f, 'rb') as fobj: - return self.read_html(fobj.read(), encoding=encoding, index_col=0) - - def test_encode(self): - assert self.files, 'no files read from the data folder' - for f in self.files: - _, encoding = _lang_enc(f) - try: - from_string = self.read_string(f, encoding).pop() - from_file_like = self.read_file_like(f, encoding).pop() - from_filename = self.read_filename(f, encoding).pop() - tm.assert_frame_equal(from_string, from_file_like) - tm.assert_frame_equal(from_string, from_filename) - except Exception: - # seems utf-16/32 fail on windows - if is_platform_windows(): - if '16' in encoding or '32' in encoding: - continue - raise - - -class TestReadHtmlEncodingLxml(TestReadHtmlEncoding): - flavor = 'lxml' - - @classmethod - def setup_class(cls): - super(TestReadHtmlEncodingLxml, cls).setup_class() - _skip_if_no(cls.flavor) - - -class TestReadHtmlLxml(object): - flavor = 'lxml' - - @classmethod - def setup_class(cls): - _skip_if_no('lxml') - + @pytest.mark.xfail def test_data_fail(self): from lxml.etree import XMLSyntaxError spam_data = os.path.join(DATA_PATH, 'spam.html') @@ -861,7 +811,6 @@ def test_works_on_valid_markup(self): @pytest.mark.slow def test_fallback_success(self): - _skip_if_none_of(('bs4', 'html5lib')) banklist_data = os.path.join(DATA_PATH, 'banklist.html') self.read_html(banklist_data, '.*Water.*', flavor=['lxml', 'html5lib']) @@ -891,136 +840,102 @@ def test_parse_dates_combine(self): def test_computer_sales_page(self): data = os.path.join(DATA_PATH, 'computer_sales_page.html') - self.read_html(data, header=[0, 1]) - - -def test_invalid_flavor(): - url = 'google.com' - with pytest.raises(ValueError): - read_html(url, 'google', flavor='not a* valid**++ flaver') - - -def get_elements_from_file(url, element='table'): - _skip_if_none_of(('bs4', 'html5lib')) - url = file_path_to_url(url) - from bs4 import BeautifulSoup - with urlopen(url) as f: - soup = BeautifulSoup(f, features='html5lib') - return soup.find_all(element) - - -@pytest.mark.slow -def test_bs4_finds_tables(): - filepath = os.path.join(DATA_PATH, "spam.html") - with warnings.catch_warnings(): - warnings.filterwarnings('ignore') - assert get_elements_from_file(filepath, 'table') - - -def get_lxml_elements(url, element): - _skip_if_no('lxml') - from lxml.html import parse - doc = parse(url) - return doc.xpath('.//{0}'.format(element)) - + self.read_html(data, header=[1, 2]) -@pytest.mark.slow -def test_lxml_finds_tables(): - filepath = os.path.join(DATA_PATH, "spam.html") - assert get_lxml_elements(filepath, 'table') + @pytest.mark.parametrize("f", glob.glob( + os.path.join(DATA_PATH, 'html_encoding', '*.html'))) + def test_encode(self, f): + _, encoding = os.path.splitext(os.path.basename(f))[0].split('_') - -@pytest.mark.slow -def test_lxml_finds_tbody(): - filepath = os.path.join(DATA_PATH, "spam.html") - assert get_lxml_elements(filepath, 'tbody') - - -def test_same_ordering(): - _skip_if_none_of(['bs4', 'lxml', 'html5lib']) - filename = os.path.join(DATA_PATH, 'valid_markup.html') - dfs_lxml = read_html(filename, index_col=0, flavor=['lxml']) - dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4']) - assert_framelist_equal(dfs_lxml, dfs_bs4) - - -class ErrorThread(threading.Thread): - def run(self): try: - super(ErrorThread, self).run() - except Exception as e: - self.err = e - else: - self.err = None + with open(f, 'rb') as fobj: + from_string = self.read_html(fobj.read(), encoding=encoding, + index_col=0).pop() + with open(f, 'rb') as fobj: + from_file_like = self.read_html(BytesIO(fobj.read()), + encoding=encoding, + index_col=0).pop() -@pytest.mark.slow -def test_importcheck_thread_safety(): - # see gh-16928 + from_filename = self.read_html(f, encoding=encoding, + index_col=0).pop() + tm.assert_frame_equal(from_string, from_file_like) + tm.assert_frame_equal(from_string, from_filename) + except Exception: + # seems utf-16/32 fail on windows + if is_platform_windows(): + if '16' in encoding or '32' in encoding: + pytest.skip() + raise - # force import check by reinitalising global vars in html.py - pytest.importorskip('lxml') - reload(pandas.io.html) + def test_parse_failure_unseekable(self): + # Issue #17975 - filename = os.path.join(DATA_PATH, 'valid_markup.html') - helper_thread1 = ErrorThread(target=read_html, args=(filename,)) - helper_thread2 = ErrorThread(target=read_html, args=(filename,)) + if self.read_html.keywords.get('flavor') == 'lxml': + pytest.skip("Not applicable for lxml") - helper_thread1.start() - helper_thread2.start() + class UnseekableStringIO(StringIO): + def seekable(self): + return False - while helper_thread1.is_alive() or helper_thread2.is_alive(): - pass - assert None is helper_thread1.err is helper_thread2.err + bad = UnseekableStringIO(''' +
spameggs
''') + assert self.read_html(bad) -def test_parse_failure_unseekable(): - # Issue #17975 - _skip_if_no('lxml') - _skip_if_no('bs4') + with pytest.raises(ValueError, + match='passed a non-rewindable file object'): + self.read_html(bad) - class UnseekableStringIO(StringIO): - def seekable(self): - return False + def test_parse_failure_rewinds(self): + # Issue #17975 - good = UnseekableStringIO(''' -
spam
eggs
''') - bad = UnseekableStringIO(''' -
spameggs
''') + class MockFile(object): + def __init__(self, data): + self.data = data + self.at_end = False - assert read_html(good) - assert read_html(bad, flavor='bs4') + def read(self, size=None): + data = '' if self.at_end else self.data + self.at_end = True + return data - bad.seek(0) + def seek(self, offset): + self.at_end = False - with pytest.raises(ValueError, - match='passed a non-rewindable file object'): - read_html(bad) + def seekable(self): + return True + good = MockFile('
spam
eggs
') + bad = MockFile('
spameggs
') -def test_parse_failure_rewinds(): - # Issue #17975 - _skip_if_no('lxml') - _skip_if_no('bs4') + assert self.read_html(good) + assert self.read_html(bad) - class MockFile(object): - def __init__(self, data): - self.data = data - self.at_end = False - def read(self, size=None): - data = '' if self.at_end else self.data - self.at_end = True - return data + @pytest.mark.slow + def test_importcheck_thread_safety(self): + # see gh-16928 + + class ErrorThread(threading.Thread): + def run(self): + try: + super(ErrorThread, self).run() + except Exception as e: + self.err = e + else: + self.err = None - def seek(self, offset): - self.at_end = False + # force import check by reinitalising global vars in html.py + reload(pandas.io.html) - def seekable(self): - return True + filename = os.path.join(DATA_PATH, 'valid_markup.html') + helper_thread1 = ErrorThread(target=self.read_html, args=(filename,)) + helper_thread2 = ErrorThread(target=self.read_html, args=(filename,)) - good = MockFile('
spam
eggs
') - bad = MockFile('
spameggs
') + helper_thread1.start() + helper_thread2.start() - assert read_html(good) - assert read_html(bad) + while helper_thread1.is_alive() or helper_thread2.is_alive(): + pass + assert None is helper_thread1.err is helper_thread2.err From 8f0ce4d79037545b1b70aa7dec7dc2f61e04972c Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 9 Mar 2018 20:21:43 -0800 Subject: [PATCH 03/13] Added in appropriate skips; cleaned up funcs --- pandas/tests/io/test_html.py | 62 ++++++++++-------------------------- 1 file changed, 17 insertions(+), 45 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 71d40ef84a685..3a8a8d4c438b9 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -7,15 +7,7 @@ import warnings from functools import partial - - -# imports needed for Python 3.x but will fail under Python 2.x -try: - from importlib import import_module, reload -except ImportError: - import_module = __import__ - - +from importlib import reload from distutils.version import LooseVersion import pytest @@ -33,43 +25,12 @@ from pandas._libs.parsers import ParserError import pandas.util.testing as tm +import pandas.util._test_decorators as td from pandas.util.testing import makeCustomDataframe as mkdf, network -def _have_module(module_name): - try: - import_module(module_name) - return True - except ImportError: - return False - - -def _skip_if_no(module_name): - if not _have_module(module_name): - pytest.skip("{0!r} not found".format(module_name)) - - -def _skip_if_none_of(module_names): - if isinstance(module_names, string_types): - _skip_if_no(module_names) - if module_names == 'bs4': - import bs4 - if LooseVersion(bs4.__version__) == LooseVersion('4.2.0'): - pytest.skip("Bad version of bs4: 4.2.0") - else: - not_found = [module_name for module_name in module_names if not - _have_module(module_name)] - if set(not_found) & set(module_names): - pytest.skip("{0!r} not found".format(not_found)) - if 'bs4' in module_names: - import bs4 - if LooseVersion(bs4.__version__) == LooseVersion('4.2.0'): - pytest.skip("Bad version of bs4: 4.2.0") - - DATA_PATH = tm.get_data_path() - def assert_framelist_equal(list1, list2, *args, **kwargs): assert len(list1) == len(list2), ('lists are not of equal size ' 'len(list1) == {0}, ' @@ -83,23 +44,31 @@ def assert_framelist_equal(list1, list2, *args, **kwargs): tm.assert_frame_equal(frame_i, frame_j, *args, **kwargs) assert not frame_i.empty, 'frames are both empty' +def _missing_bs4(): + bs4 = td.safe_import('bs4') + if not bs4 or LooseVersion(bs4.__version__) == LooseVersion('4.2.0'): + return True + + return False +@td.skip_if_no('bs4') def test_bs4_version_fails(): - _skip_if_none_of(('bs4', 'html5lib')) import bs4 if LooseVersion(bs4.__version__) == LooseVersion('4.2.0'): tm.assert_raises(AssertionError, read_html, os.path.join(DATA_PATH, "spam.html"), flavor='bs4') - + else: + pytest.skip("Only applicable for bs4 version 4.2.0") def test_invalid_flavor(): url = 'google.com' with pytest.raises(ValueError): read_html(url, 'google', flavor='not a* valid**++ flaver') +@td.skip_if_no('bs4') +@td.skip_if_no('lxml') def test_same_ordering(): - _skip_if_none_of(['bs4', 'lxml', 'html5lib']) filename = os.path.join(DATA_PATH, 'valid_markup.html') dfs_lxml = read_html(filename, index_col=0, flavor=['lxml']) dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4']) @@ -107,7 +76,10 @@ def test_same_ordering(): @pytest.mark.parametrize("flavor", [ - 'bs4', 'lxml'], scope="class") + pytest.param('bs4', marks=pytest.mark.skipif( + _missing_bs4(), reason='No bs4')), + pytest.param('lxml', marks=pytest.mark.skipif( + not td.safe_import('lxml'), reason='No lxml'))], scope="class") class TestReadHtml(object): spam_data = os.path.join(DATA_PATH, 'spam.html') spam_data_kwargs = {} From 476c19ac41d39d3049efea9f63e3510734f732a3 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 9 Mar 2018 20:28:33 -0800 Subject: [PATCH 04/13] Added reload to compat --- pandas/compat/__init__.py | 4 ++++ pandas/tests/io/test_html.py | 3 +-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 78aaf4596c8b7..aefa1ddd6cf0b 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -131,6 +131,9 @@ def lmap(*args, **kwargs): def lfilter(*args, **kwargs): return list(filter(*args, **kwargs)) + from importlib import reload + reload = reload + else: # Python 2 import re @@ -184,6 +187,7 @@ def get_range_parameters(data): lmap = builtins.map lfilter = builtins.filter + reload = builtins.reload if PY2: def iteritems(obj, **kw): diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 3a8a8d4c438b9..f0fca753c6406 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -7,7 +7,6 @@ import warnings from functools import partial -from importlib import reload from distutils.version import LooseVersion import pytest @@ -18,7 +17,7 @@ from pandas import (DataFrame, MultiIndex, read_csv, Timestamp, Index, date_range, Series) from pandas.compat import (map, zip, StringIO, string_types, BytesIO, - is_platform_windows, PY3) + is_platform_windows, PY3, reload) from pandas.io.common import URLError, urlopen, file_path_to_url import pandas.io.html from pandas.io.html import read_html From 478601c16ff553c5d95465c5e9ff7a6de54fb179 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 9 Mar 2018 20:35:59 -0800 Subject: [PATCH 05/13] LINT fixes --- pandas/io/html.py | 3 +- pandas/tests/io/test_html.py | 68 ++++++------------------------------ 2 files changed, 11 insertions(+), 60 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 6e9e61182ab48..fd795cd6837ae 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -14,8 +14,7 @@ from pandas.core.dtypes.common import is_list_like from pandas.errors import EmptyDataError -from pandas.io.common import (_is_url, urlopen, - parse_url, _validate_header_arg) +from pandas.io.common import _is_url, urlopen, _validate_header_arg from pandas.io.parsers import TextParser from pandas.compat import (lrange, lmap, u, string_types, iteritems, raise_with_traceback, binary_type) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index e9539aa731a3b..27d04ed696885 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -4,7 +4,6 @@ import os import re import threading -import warnings from functools import partial from distutils.version import LooseVersion @@ -16,9 +15,9 @@ from pandas import (DataFrame, MultiIndex, read_csv, Timestamp, Index, date_range, Series) -from pandas.compat import (map, zip, StringIO, string_types, BytesIO, +from pandas.compat import (map, zip, StringIO, BytesIO, is_platform_windows, PY3, reload) -from pandas.io.common import URLError, urlopen, file_path_to_url +from pandas.io.common import URLError, file_path_to_url import pandas.io.html from pandas.io.html import read_html from pandas._libs.parsers import ParserError @@ -30,6 +29,7 @@ DATA_PATH = tm.get_data_path() + def assert_framelist_equal(list1, list2, *args, **kwargs): assert len(list1) == len(list2), ('lists are not of equal size ' 'len(list1) == {0}, ' @@ -43,6 +43,7 @@ def assert_framelist_equal(list1, list2, *args, **kwargs): tm.assert_frame_equal(frame_i, frame_j, *args, **kwargs) assert not frame_i.empty, 'frames are both empty' + def _missing_bs4(): bs4 = td.safe_import('bs4') if not bs4 or LooseVersion(bs4.__version__) == LooseVersion('4.2.0'): @@ -50,6 +51,7 @@ def _missing_bs4(): return False + @td.skip_if_no('bs4') def test_bs4_version_fails(): import bs4 @@ -60,11 +62,13 @@ def test_bs4_version_fails(): else: pytest.skip("Only applicable for bs4 version 4.2.0") + def test_invalid_flavor(): url = 'google.com' with pytest.raises(ValueError): read_html(url, 'google', flavor='not a* valid**++ flaver') + @td.skip_if_no('bs4') @td.skip_if_no('lxml') def test_same_ordering(): @@ -647,6 +651,9 @@ def test_computer_sales_page(self): r"multi_index of columns"): self.read_html(data, header=[0, 1]) + data = os.path.join(DATA_PATH, 'computer_sales_page.html') + assert self.read_html(data, header=[1, 2]) + def test_wikipedia_states_table(self): data = os.path.join(DATA_PATH, 'wikipedia_states.html') assert os.path.isfile(data), '%r is not a file' % data @@ -654,39 +661,6 @@ def test_wikipedia_states_table(self): result = self.read_html(data, 'Arizona', header=1)[0] assert result['sq mi'].dtype == np.dtype('float64') - @pytest.mark.parametrize("displayed_only,exp0,exp1", [ - (True, DataFrame(["foo"]), None), - (False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"]))]) - def test_displayed_only(self, displayed_only, exp0, exp1): - # GH 20027 - data = StringIO(""" - - - - - -
- foo - bar - baz - qux -
- - - - -
foo
- - """) - - dfs = self.read_html(data, displayed_only=displayed_only) - tm.assert_frame_equal(dfs[0], exp0) - - if exp1 is not None: - tm.assert_frame_equal(dfs[1], exp1) - else: - assert len(dfs) == 1 # Should not parse hidden table - def test_decimal_rows(self): # GH 12907 @@ -825,27 +799,6 @@ def test_to_html_timestamp(self): result = df.to_html() assert '2000-01-01' in result - def test_parse_dates_list(self): - df = DataFrame({'date': date_range('1/1/2001', periods=10)}) - expected = df.to_html() - res = self.read_html(expected, parse_dates=[1], index_col=0) - tm.assert_frame_equal(df, res[0]) - res = self.read_html(expected, parse_dates=['date'], index_col=0) - tm.assert_frame_equal(df, res[0]) - - def test_parse_dates_combine(self): - raw_dates = Series(date_range('1/1/2001', periods=10)) - df = DataFrame({'date': raw_dates.map(lambda x: str(x.date())), - 'time': raw_dates.map(lambda x: str(x.time()))}) - res = self.read_html(df.to_html(), parse_dates={'datetime': [1, 2]}, - index_col=1) - newdf = DataFrame({'datetime': raw_dates}) - tm.assert_frame_equal(newdf, res[0]) - - def test_computer_sales_page(self): - data = os.path.join(DATA_PATH, 'computer_sales_page.html') - self.read_html(data, header=[1, 2]) - @pytest.mark.parametrize("displayed_only,exp0,exp1", [ (True, DataFrame(["foo"]), None), (False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"]))]) @@ -949,7 +902,6 @@ def seekable(self): assert self.read_html(good) assert self.read_html(bad) - @pytest.mark.slow def test_importcheck_thread_safety(self): # see gh-16928 From 23602249b9a0b4f6ec2a9bf8b5b5411ee43719d4 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 11 Mar 2018 21:01:47 -0700 Subject: [PATCH 06/13] Py27 compat --- pandas/io/html.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index fd795cd6837ae..5186b5fce41b1 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -607,11 +607,12 @@ def _build_doc(self): parser = HTMLParser(recover=True, encoding=self.encoding) try: - _io = self.io - if _is_url(_io): - _io = urlopen(_io) - # try to parse the input in the simplest way - r = parse(_io, parser=parser) + if _is_url(self.io): + with urlopen(self.io) as f: + r = parse(f, parser=parser) + else: + # try to parse the input in the simplest way + r = parse(self.io, parser=parser) try: r = r.getroot() except AttributeError: From a93a5a3b4a85cc2c12297789d8bc1a79575035cc Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 12 Mar 2018 23:51:57 -0700 Subject: [PATCH 07/13] Increased bs4 min version req --- ci/requirements-optional-conda.txt | 2 +- ci/requirements-optional-pip.txt | 2 +- pandas/io/html.py | 12 ++++-------- pandas/tests/io/test_html.py | 12 ++++-------- 4 files changed, 10 insertions(+), 18 deletions(-) diff --git a/ci/requirements-optional-conda.txt b/ci/requirements-optional-conda.txt index 6edb8d17337e4..65357ce2018d2 100644 --- a/ci/requirements-optional-conda.txt +++ b/ci/requirements-optional-conda.txt @@ -1,4 +1,4 @@ -beautifulsoup4 +beautifulsoup4>=4.2.1 blosc bottleneck fastparquet diff --git a/ci/requirements-optional-pip.txt b/ci/requirements-optional-pip.txt index 8d4421ba2b681..43c7d47892095 100644 --- a/ci/requirements-optional-pip.txt +++ b/ci/requirements-optional-pip.txt @@ -1,6 +1,6 @@ # This file was autogenerated by scripts/convert_deps.py # Do not modify directly -beautifulsoup4 +beautifulsoup4>=4.2.1 blosc bottleneck fastparquet diff --git a/pandas/io/html.py b/pandas/io/html.py index 5186b5fce41b1..ba5da1b4e3a76 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -738,14 +738,10 @@ def _parser_dispatch(flavor): raise ImportError( "BeautifulSoup4 (bs4) not found, please install it") import bs4 - if LooseVersion(bs4.__version__) == LooseVersion('4.2.0'): - raise ValueError("You're using a version" - " of BeautifulSoup4 (4.2.0) that has been" - " known to cause problems on certain" - " operating systems such as Debian. " - "Please install a version of" - " BeautifulSoup4 != 4.2.0, both earlier" - " and later releases will work.") + if LooseVersion(bs4.__version__) <= LooseVersion('4.2.0'): + raise ValueError("A minimum version of BeautifulSoup 4.2.1 " + "is required") + else: if not _HAS_LXML: raise ImportError("lxml not found, please install it") diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 27d04ed696885..d40c45c969442 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -53,15 +53,11 @@ def _missing_bs4(): @td.skip_if_no('bs4') -def test_bs4_version_fails(): +def test_bs4_version_fails(monkeypatch): import bs4 - if LooseVersion(bs4.__version__) == LooseVersion('4.2.0'): - tm.assert_raises(AssertionError, read_html, os.path.join(DATA_PATH, - "spam.html"), - flavor='bs4') - else: - pytest.skip("Only applicable for bs4 version 4.2.0") - + monkeypatch.setattr(bs4, '__version__', '4.2') + with tm.assert_raises_regex(ValueError, "minimum version"): + read_html(os.path.join(DATA_PATH, "spam.html"), flavor='bs4') def test_invalid_flavor(): url = 'google.com' From 29904d1ca5dd270d7e3ff1bb0cea9630fa8a9b88 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 12 Mar 2018 23:52:34 -0700 Subject: [PATCH 08/13] Removed xfail test for lxml --- pandas/tests/io/test_html.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index d40c45c969442..39b1f8bcea649 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -765,18 +765,6 @@ def test_multiple_header_rows(self): html_df = read_html(html, )[0] tm.assert_frame_equal(expected_df, html_df) - @pytest.mark.xfail - def test_data_fail(self): - from lxml.etree import XMLSyntaxError - spam_data = os.path.join(DATA_PATH, 'spam.html') - banklist_data = os.path.join(DATA_PATH, 'banklist.html') - - with pytest.raises(XMLSyntaxError): - self.read_html(spam_data) - - with pytest.raises(XMLSyntaxError): - self.read_html(banklist_data) - def test_works_on_valid_markup(self): filename = os.path.join(DATA_PATH, 'valid_markup.html') dfs = self.read_html(filename, index_col=0) From f488fc8c07efaf94f35404597702a64d7099093d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 12 Mar 2018 23:54:47 -0700 Subject: [PATCH 09/13] LINTing --- pandas/tests/io/test_html.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 39b1f8bcea649..0b021cc01beae 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -59,6 +59,7 @@ def test_bs4_version_fails(monkeypatch): with tm.assert_raises_regex(ValueError, "minimum version"): read_html(os.path.join(DATA_PATH, "spam.html"), flavor='bs4') + def test_invalid_flavor(): url = 'google.com' with pytest.raises(ValueError): From 41b77e1a22ea6eaea349c9218d701e7ba1dca206 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 13 Mar 2018 10:56:34 -0700 Subject: [PATCH 10/13] Clean up unnecessary test --- pandas/tests/io/test_html.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 0b021cc01beae..f18fc2e91e266 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -44,14 +44,6 @@ def assert_framelist_equal(list1, list2, *args, **kwargs): assert not frame_i.empty, 'frames are both empty' -def _missing_bs4(): - bs4 = td.safe_import('bs4') - if not bs4 or LooseVersion(bs4.__version__) == LooseVersion('4.2.0'): - return True - - return False - - @td.skip_if_no('bs4') def test_bs4_version_fails(monkeypatch): import bs4 @@ -77,7 +69,7 @@ def test_same_ordering(): @pytest.mark.parametrize("flavor", [ pytest.param('bs4', marks=pytest.mark.skipif( - _missing_bs4(), reason='No bs4')), + not td.safe_import('lxml'), reason='No bs4')), pytest.param('lxml', marks=pytest.mark.skipif( not td.safe_import('lxml'), reason='No lxml'))], scope="class") class TestReadHtml(object): From d44b164c967ec99da8dd9ae5a0d1fedca447c6ca Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 13 Mar 2018 11:17:15 -0700 Subject: [PATCH 11/13] Updated documentation --- doc/source/install.rst | 9 ++++++--- doc/source/whatsnew/v0.23.0.txt | 16 +++++++++------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/doc/source/install.rst b/doc/source/install.rst index 07f57dbd65709..7d741c6c2c75a 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -266,6 +266,12 @@ Optional Dependencies * One of the following combinations of libraries is needed to use the top-level :func:`~pandas.read_html` function: + .. versionchanged:: 0.23.0 + + .. note:: + + If using BeautifulSoup4 a minimum version of 4.2.1 is required + * `BeautifulSoup4`_ and `html5lib`_ (Any recent version of `html5lib`_ is okay.) * `BeautifulSoup4`_ and `lxml`_ @@ -282,9 +288,6 @@ Optional Dependencies * You are highly encouraged to read :ref:`HTML Table Parsing gotchas `. It explains issues surrounding the installation and usage of the above three libraries. - * You may need to install an older version of `BeautifulSoup4`_: - Versions 4.2.1, 4.1.3 and 4.0.2 have been confirmed for 64 and 32-bit - Ubuntu/Debian .. note:: diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index f686a042c1a74..c6683edef7272 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -358,13 +358,15 @@ Dependencies have increased minimum versions We have updated our minimum supported versions of dependencies (:issue:`15184`). If installed, we now require: -+-----------------+-----------------+----------+ -| Package | Minimum Version | Required | -+=================+=================+==========+ -| python-dateutil | 2.5.0 | X | -+-----------------+-----------------+----------+ -| openpyxl | 2.4.0 | | -+-----------------+-----------------+----------+ ++-----------------+-----------------+----------+---------------+ +| Package | Minimum Version | Required | Issue | ++=================+=================+==========+===============+ +| python-dateutil | 2.5.0 | X | :issue:`15184`| ++-----------------+-----------------+----------+---------------+ +| openpyxl | 2.4.0 | | :issue:`15184`| ++-----------------+-----------------+----------+---------------+ +| beautifulsoup4 | 4.2.1 | | :issue:`20082`| ++-----------------+-----------------+----------+---------------+ .. _whatsnew_0230.api_breaking.dict_insertion_order: From e6943b1a48ccd51fae4e60c3f232d1337b0ad8e2 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 13 Mar 2018 16:37:29 -0700 Subject: [PATCH 12/13] Bumped bs4 build req --- ci/requirements-2.7_COMPAT.pip | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements-2.7_COMPAT.pip b/ci/requirements-2.7_COMPAT.pip index 13cd35a923124..0e154dbc07525 100644 --- a/ci/requirements-2.7_COMPAT.pip +++ b/ci/requirements-2.7_COMPAT.pip @@ -1,4 +1,4 @@ html5lib==1.0b2 -beautifulsoup4==4.2.0 +beautifulsoup4==4.2.1 openpyxl argparse From 50d072dbdce2bd44d1b83044743c1e511f7c1f25 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 13 Mar 2018 16:38:32 -0700 Subject: [PATCH 13/13] LINT fix --- pandas/tests/io/test_html.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index f18fc2e91e266..79b9a3715efd2 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -6,7 +6,6 @@ import threading from functools import partial -from distutils.version import LooseVersion import pytest