diff --git a/ci/requirements-2.7_COMPAT.pip b/ci/requirements-2.7_COMPAT.pip index 13cd35a923124..0e154dbc07525 100644 --- a/ci/requirements-2.7_COMPAT.pip +++ b/ci/requirements-2.7_COMPAT.pip @@ -1,4 +1,4 @@ html5lib==1.0b2 -beautifulsoup4==4.2.0 +beautifulsoup4==4.2.1 openpyxl argparse diff --git a/ci/requirements-optional-conda.txt b/ci/requirements-optional-conda.txt index 6edb8d17337e4..65357ce2018d2 100644 --- a/ci/requirements-optional-conda.txt +++ b/ci/requirements-optional-conda.txt @@ -1,4 +1,4 @@ -beautifulsoup4 +beautifulsoup4>=4.2.1 blosc bottleneck fastparquet diff --git a/ci/requirements-optional-pip.txt b/ci/requirements-optional-pip.txt index 8d4421ba2b681..43c7d47892095 100644 --- a/ci/requirements-optional-pip.txt +++ b/ci/requirements-optional-pip.txt @@ -1,6 +1,6 @@ # This file was autogenerated by scripts/convert_deps.py # Do not modify directly -beautifulsoup4 +beautifulsoup4>=4.2.1 blosc bottleneck fastparquet diff --git a/doc/source/install.rst b/doc/source/install.rst index 07f57dbd65709..7d741c6c2c75a 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -266,6 +266,12 @@ Optional Dependencies * One of the following combinations of libraries is needed to use the top-level :func:`~pandas.read_html` function: + .. versionchanged:: 0.23.0 + + .. note:: + + If using BeautifulSoup4 a minimum version of 4.2.1 is required + * `BeautifulSoup4`_ and `html5lib`_ (Any recent version of `html5lib`_ is okay.) * `BeautifulSoup4`_ and `lxml`_ @@ -282,9 +288,6 @@ Optional Dependencies * You are highly encouraged to read :ref:`HTML Table Parsing gotchas `. It explains issues surrounding the installation and usage of the above three libraries. - * You may need to install an older version of `BeautifulSoup4`_: - Versions 4.2.1, 4.1.3 and 4.0.2 have been confirmed for 64 and 32-bit - Ubuntu/Debian .. 
note:: diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 791365295c268..c08e22af295f4 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -358,13 +358,15 @@ Dependencies have increased minimum versions We have updated our minimum supported versions of dependencies (:issue:`15184`). If installed, we now require: -+-----------------+-----------------+----------+ -| Package | Minimum Version | Required | -+=================+=================+==========+ -| python-dateutil | 2.5.0 | X | -+-----------------+-----------------+----------+ -| openpyxl | 2.4.0 | | -+-----------------+-----------------+----------+ ++-----------------+-----------------+----------+---------------+ +| Package | Minimum Version | Required | Issue | ++=================+=================+==========+===============+ +| python-dateutil | 2.5.0 | X | :issue:`15184`| ++-----------------+-----------------+----------+---------------+ +| openpyxl | 2.4.0 | | :issue:`15184`| ++-----------------+-----------------+----------+---------------+ +| beautifulsoup4 | 4.2.1 | | :issue:`20082`| ++-----------------+-----------------+----------+---------------+ .. 
_whatsnew_0230.api_breaking.dict_insertion_order: diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 78aaf4596c8b7..aefa1ddd6cf0b 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -131,6 +131,9 @@ def lmap(*args, **kwargs): def lfilter(*args, **kwargs): return list(filter(*args, **kwargs)) + from importlib import reload + reload = reload + else: # Python 2 import re @@ -184,6 +187,7 @@ def get_range_parameters(data): lmap = builtins.map lfilter = builtins.filter + reload = builtins.reload if PY2: def iteritems(obj, **kw): diff --git a/pandas/io/html.py b/pandas/io/html.py index 300a5a151f5d2..ba5da1b4e3a76 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -14,8 +14,7 @@ from pandas.core.dtypes.common import is_list_like from pandas.errors import EmptyDataError -from pandas.io.common import (_is_url, urlopen, - parse_url, _validate_header_arg) +from pandas.io.common import _is_url, urlopen, _validate_header_arg from pandas.io.parsers import TextParser from pandas.compat import (lrange, lmap, u, string_types, iteritems, raise_with_traceback, binary_type) @@ -554,8 +553,7 @@ def _parse_td(self, row): return row.xpath('.//td|.//th') def _parse_tr(self, table): - expr = './/tr[normalize-space()]' - return table.xpath(expr) + return table.xpath('.//tr') def _parse_tables(self, doc, match, kwargs): pattern = match.pattern @@ -606,18 +604,20 @@ def _build_doc(self): """ from lxml.html import parse, fromstring, HTMLParser from lxml.etree import XMLSyntaxError - - parser = HTMLParser(recover=False, encoding=self.encoding) + parser = HTMLParser(recover=True, encoding=self.encoding) try: - # try to parse the input in the simplest way - r = parse(self.io, parser=parser) - + if _is_url(self.io): + with urlopen(self.io) as f: + r = parse(f, parser=parser) + else: + # try to parse the input in the simplest way + r = parse(self.io, parser=parser) try: r = r.getroot() except AttributeError: pass - except (UnicodeDecodeError, 
IOError): + except (UnicodeDecodeError, IOError) as e: # if the input is a blob of html goop if not _is_url(self.io): r = fromstring(self.io, parser=parser) @@ -627,17 +627,7 @@ def _build_doc(self): except AttributeError: pass else: - # not a url - scheme = parse_url(self.io).scheme - if scheme not in _valid_schemes: - # lxml can't parse it - msg = (('{invalid!r} is not a valid url scheme, valid ' - 'schemes are {valid}') - .format(invalid=scheme, valid=_valid_schemes)) - raise ValueError(msg) - else: - # something else happened: maybe a faulty connection - raise + raise e else: if not hasattr(r, 'text_content'): raise XMLSyntaxError("no text parsed from document", 0, 0, 0) @@ -657,12 +647,21 @@ def _parse_raw_thead(self, table): thead = table.xpath(expr) res = [] if thead: - trs = self._parse_tr(thead[0]) - for tr in trs: - cols = [_remove_whitespace(x.text_content()) for x in - self._parse_td(tr)] + # Grab any directly descending table headers first + ths = thead[0].xpath('./th') + if ths: + cols = [_remove_whitespace(x.text_content()) for x in ths] if any(col != '' for col in cols): res.append(cols) + else: + trs = self._parse_tr(thead[0]) + + for tr in trs: + cols = [_remove_whitespace(x.text_content()) for x in + self._parse_td(tr)] + + if any(col != '' for col in cols): + res.append(cols) return res def _parse_raw_tfoot(self, table): @@ -739,14 +738,10 @@ def _parser_dispatch(flavor): raise ImportError( "BeautifulSoup4 (bs4) not found, please install it") import bs4 - if LooseVersion(bs4.__version__) == LooseVersion('4.2.0'): - raise ValueError("You're using a version" - " of BeautifulSoup4 (4.2.0) that has been" - " known to cause problems on certain" - " operating systems such as Debian. 
" - "Please install a version of" - " BeautifulSoup4 != 4.2.0, both earlier" - " and later releases will work.") + if LooseVersion(bs4.__version__) <= LooseVersion('4.2.0'): + raise ValueError("A minimum version of BeautifulSoup 4.2.1 " + "is required") + else: if not _HAS_LXML: raise ImportError("lxml not found, please install it") diff --git a/pandas/tests/io/data/banklist.html b/pandas/tests/io/data/banklist.html index cbcce5a2d49ff..c6f0e47c2a3ef 100644 --- a/pandas/tests/io/data/banklist.html +++ b/pandas/tests/io/data/banklist.html @@ -340,6 +340,7 @@

Failed Bank List

April 19, 2013 April 23, 2013 + Gold Canyon Bank Gold Canyon AZ diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index b18104e951504..79b9a3715efd2 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -4,17 +4,8 @@ import os import re import threading -import warnings - -# imports needed for Python 3.x but will fail under Python 2.x -try: - from importlib import import_module, reload -except ImportError: - import_module = __import__ - - -from distutils.version import LooseVersion +from functools import partial import pytest @@ -23,48 +14,18 @@ from pandas import (DataFrame, MultiIndex, read_csv, Timestamp, Index, date_range, Series) -from pandas.compat import (map, zip, StringIO, string_types, BytesIO, - is_platform_windows, PY3) -from pandas.io.common import URLError, urlopen, file_path_to_url +from pandas.compat import (map, zip, StringIO, BytesIO, + is_platform_windows, PY3, reload) +from pandas.io.common import URLError, file_path_to_url import pandas.io.html from pandas.io.html import read_html from pandas._libs.parsers import ParserError import pandas.util.testing as tm +import pandas.util._test_decorators as td from pandas.util.testing import makeCustomDataframe as mkdf, network -def _have_module(module_name): - try: - import_module(module_name) - return True - except ImportError: - return False - - -def _skip_if_no(module_name): - if not _have_module(module_name): - pytest.skip("{0!r} not found".format(module_name)) - - -def _skip_if_none_of(module_names): - if isinstance(module_names, string_types): - _skip_if_no(module_names) - if module_names == 'bs4': - import bs4 - if LooseVersion(bs4.__version__) == LooseVersion('4.2.0'): - pytest.skip("Bad version of bs4: 4.2.0") - else: - not_found = [module_name for module_name in module_names if not - _have_module(module_name)] - if set(not_found) & set(module_names): - pytest.skip("{0!r} not found".format(not_found)) - if 'bs4' in module_names: - import bs4 - if 
LooseVersion(bs4.__version__) == LooseVersion('4.2.0'): - pytest.skip("Bad version of bs4: 4.2.0") - - DATA_PATH = tm.get_data_path() @@ -82,33 +43,45 @@ def assert_framelist_equal(list1, list2, *args, **kwargs): assert not frame_i.empty, 'frames are both empty' -def test_bs4_version_fails(): - _skip_if_none_of(('bs4', 'html5lib')) +@td.skip_if_no('bs4') +def test_bs4_version_fails(monkeypatch): import bs4 - if LooseVersion(bs4.__version__) == LooseVersion('4.2.0'): - tm.assert_raises(AssertionError, read_html, os.path.join(DATA_PATH, - "spam.html"), - flavor='bs4') + monkeypatch.setattr(bs4, '__version__', '4.2') + with tm.assert_raises_regex(ValueError, "minimum version"): + read_html(os.path.join(DATA_PATH, "spam.html"), flavor='bs4') -class ReadHtmlMixin(object): +def test_invalid_flavor(): + url = 'google.com' + with pytest.raises(ValueError): + read_html(url, 'google', flavor='not a* valid**++ flaver') - def read_html(self, *args, **kwargs): - kwargs.setdefault('flavor', self.flavor) - return read_html(*args, **kwargs) + +@td.skip_if_no('bs4') +@td.skip_if_no('lxml') +def test_same_ordering(): + filename = os.path.join(DATA_PATH, 'valid_markup.html') + dfs_lxml = read_html(filename, index_col=0, flavor=['lxml']) + dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4']) + assert_framelist_equal(dfs_lxml, dfs_bs4) -class TestReadHtml(ReadHtmlMixin): - flavor = 'bs4' +@pytest.mark.parametrize("flavor", [ + pytest.param('bs4', marks=pytest.mark.skipif( + not td.safe_import('bs4'), reason='No bs4')), + pytest.param('lxml', marks=pytest.mark.skipif( + not td.safe_import('lxml'), reason='No lxml'))], scope="class") +class TestReadHtml(object): spam_data = os.path.join(DATA_PATH, 'spam.html') spam_data_kwargs = {} if PY3: spam_data_kwargs['encoding'] = 'UTF-8' banklist_data = os.path.join(DATA_PATH, 'banklist.html') - @classmethod - def setup_class(cls): - _skip_if_none_of(('bs4', 'html5lib')) + @pytest.fixture(autouse=True, scope="function") + def 
set_defaults(self, flavor, request): + self.read_html = partial(read_html, flavor=flavor) + yield def test_to_html_compat(self): df = mkdf(4, 3, data_gen_f=lambda *args: rand(), c_idx_names=False, @@ -150,7 +123,6 @@ def test_spam_no_types(self): df1 = self.read_html(self.spam_data, '.*Water.*') df2 = self.read_html(self.spam_data, 'Unit') assert_framelist_equal(df1, df2) - assert df1[0].iloc[0, 0] == 'Proximates' assert df1[0].columns[0] == 'Nutrient' @@ -667,6 +639,9 @@ def test_computer_sales_page(self): r"multi_index of columns"): self.read_html(data, header=[0, 1]) + data = os.path.join(DATA_PATH, 'computer_sales_page.html') + assert self.read_html(data, header=[1, 2]) + def test_wikipedia_states_table(self): data = os.path.join(DATA_PATH, 'wikipedia_states.html') assert os.path.isfile(data), '%r is not a file' % data @@ -674,39 +649,6 @@ def test_wikipedia_states_table(self): result = self.read_html(data, 'Arizona', header=1)[0] assert result['sq mi'].dtype == np.dtype('float64') - @pytest.mark.parametrize("displayed_only,exp0,exp1", [ - (True, DataFrame(["foo"]), None), - (False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"]))]) - def test_displayed_only(self, displayed_only, exp0, exp1): - # GH 20027 - data = StringIO(""" - - - - - -
- foo - bar - baz - qux -
- - - - -
foo
- - """) - - dfs = self.read_html(data, displayed_only=displayed_only) - tm.assert_frame_equal(dfs[0], exp0) - - if exp1 is not None: - tm.assert_frame_equal(dfs[1], exp1) - else: - assert len(dfs) == 1 # Should not parse hidden table - def test_decimal_rows(self): # GH 12907 @@ -815,80 +757,6 @@ def test_multiple_header_rows(self): html_df = read_html(html, )[0] tm.assert_frame_equal(expected_df, html_df) - -def _lang_enc(filename): - return os.path.splitext(os.path.basename(filename))[0].split('_') - - -class TestReadHtmlEncoding(object): - files = glob.glob(os.path.join(DATA_PATH, 'html_encoding', '*.html')) - flavor = 'bs4' - - @classmethod - def setup_class(cls): - _skip_if_none_of((cls.flavor, 'html5lib')) - - def read_html(self, *args, **kwargs): - kwargs['flavor'] = self.flavor - return read_html(*args, **kwargs) - - def read_filename(self, f, encoding): - return self.read_html(f, encoding=encoding, index_col=0) - - def read_file_like(self, f, encoding): - with open(f, 'rb') as fobj: - return self.read_html(BytesIO(fobj.read()), encoding=encoding, - index_col=0) - - def read_string(self, f, encoding): - with open(f, 'rb') as fobj: - return self.read_html(fobj.read(), encoding=encoding, index_col=0) - - def test_encode(self): - assert self.files, 'no files read from the data folder' - for f in self.files: - _, encoding = _lang_enc(f) - try: - from_string = self.read_string(f, encoding).pop() - from_file_like = self.read_file_like(f, encoding).pop() - from_filename = self.read_filename(f, encoding).pop() - tm.assert_frame_equal(from_string, from_file_like) - tm.assert_frame_equal(from_string, from_filename) - except Exception: - # seems utf-16/32 fail on windows - if is_platform_windows(): - if '16' in encoding or '32' in encoding: - continue - raise - - -class TestReadHtmlEncodingLxml(TestReadHtmlEncoding): - flavor = 'lxml' - - @classmethod - def setup_class(cls): - super(TestReadHtmlEncodingLxml, cls).setup_class() - _skip_if_no(cls.flavor) - - -class 
TestReadHtmlLxml(ReadHtmlMixin): - flavor = 'lxml' - - @classmethod - def setup_class(cls): - _skip_if_no('lxml') - - def test_data_fail(self): - from lxml.etree import XMLSyntaxError - spam_data = os.path.join(DATA_PATH, 'spam.html') - banklist_data = os.path.join(DATA_PATH, 'banklist.html') - - with pytest.raises(XMLSyntaxError): - self.read_html(spam_data) - - with pytest.raises(XMLSyntaxError): - self.read_html(banklist_data) - def test_works_on_valid_markup(self): filename = os.path.join(DATA_PATH, 'valid_markup.html') dfs = self.read_html(filename, index_col=0) @@ -897,7 +765,6 @@ def test_works_on_valid_markup(self): @pytest.mark.slow def test_fallback_success(self): - _skip_if_none_of(('bs4', 'html5lib')) banklist_data = os.path.join(DATA_PATH, 'banklist.html') self.read_html(banklist_data, '.*Water.*', flavor=['lxml', 'html5lib']) @@ -908,27 +775,6 @@ def test_to_html_timestamp(self): result = df.to_html() assert '2000-01-01' in result - def test_parse_dates_list(self): - df = DataFrame({'date': date_range('1/1/2001', periods=10)}) - expected = df.to_html() - res = self.read_html(expected, parse_dates=[1], index_col=0) - tm.assert_frame_equal(df, res[0]) - res = self.read_html(expected, parse_dates=['date'], index_col=0) - tm.assert_frame_equal(df, res[0]) - - def test_parse_dates_combine(self): - raw_dates = Series(date_range('1/1/2001', periods=10)) - df = DataFrame({'date': raw_dates.map(lambda x: str(x.date())), - 'time': raw_dates.map(lambda x: str(x.time()))}) - res = self.read_html(df.to_html(), parse_dates={'datetime': [1, 2]}, - index_col=1) - newdf = DataFrame({'datetime': raw_dates}) - tm.assert_frame_equal(newdf, res[0]) - - def test_computer_sales_page(self): - data = os.path.join(DATA_PATH, 'computer_sales_page.html') - self.read_html(data, header=[0, 1]) - @pytest.mark.parametrize("displayed_only,exp0,exp1", [ (True, DataFrame(["foo"]), None), (False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"]))]) @@ -962,134 +808,99 @@ def 
test_displayed_only(self, displayed_only, exp0, exp1): else: assert len(dfs) == 1 # Should not parse hidden table + @pytest.mark.parametrize("f", glob.glob( + os.path.join(DATA_PATH, 'html_encoding', '*.html'))) + def test_encode(self, f): + _, encoding = os.path.splitext(os.path.basename(f))[0].split('_') -def test_invalid_flavor(): - url = 'google.com' - with pytest.raises(ValueError): - read_html(url, 'google', flavor='not a* valid**++ flaver') - - -def get_elements_from_file(url, element='table'): - _skip_if_none_of(('bs4', 'html5lib')) - url = file_path_to_url(url) - from bs4 import BeautifulSoup - with urlopen(url) as f: - soup = BeautifulSoup(f, features='html5lib') - return soup.find_all(element) - - -@pytest.mark.slow -def test_bs4_finds_tables(): - filepath = os.path.join(DATA_PATH, "spam.html") - with warnings.catch_warnings(): - warnings.filterwarnings('ignore') - assert get_elements_from_file(filepath, 'table') - - -def get_lxml_elements(url, element): - _skip_if_no('lxml') - from lxml.html import parse - doc = parse(url) - return doc.xpath('.//{0}'.format(element)) - - -@pytest.mark.slow -def test_lxml_finds_tables(): - filepath = os.path.join(DATA_PATH, "spam.html") - assert get_lxml_elements(filepath, 'table') - - -@pytest.mark.slow -def test_lxml_finds_tbody(): - filepath = os.path.join(DATA_PATH, "spam.html") - assert get_lxml_elements(filepath, 'tbody') - - -def test_same_ordering(): - _skip_if_none_of(['bs4', 'lxml', 'html5lib']) - filename = os.path.join(DATA_PATH, 'valid_markup.html') - dfs_lxml = read_html(filename, index_col=0, flavor=['lxml']) - dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4']) - assert_framelist_equal(dfs_lxml, dfs_bs4) - - -class ErrorThread(threading.Thread): - def run(self): try: - super(ErrorThread, self).run() - except Exception as e: - self.err = e - else: - self.err = None + with open(f, 'rb') as fobj: + from_string = self.read_html(fobj.read(), encoding=encoding, + index_col=0).pop() + with open(f, 'rb') 
as fobj: + from_file_like = self.read_html(BytesIO(fobj.read()), + encoding=encoding, + index_col=0).pop() -@pytest.mark.slow -def test_importcheck_thread_safety(): - # see gh-16928 + from_filename = self.read_html(f, encoding=encoding, + index_col=0).pop() + tm.assert_frame_equal(from_string, from_file_like) + tm.assert_frame_equal(from_string, from_filename) + except Exception: + # seems utf-16/32 fail on windows + if is_platform_windows(): + if '16' in encoding or '32' in encoding: + pytest.skip() + raise - # force import check by reinitalising global vars in html.py - pytest.importorskip('lxml') - reload(pandas.io.html) + def test_parse_failure_unseekable(self): + # Issue #17975 - filename = os.path.join(DATA_PATH, 'valid_markup.html') - helper_thread1 = ErrorThread(target=read_html, args=(filename,)) - helper_thread2 = ErrorThread(target=read_html, args=(filename,)) + if self.read_html.keywords.get('flavor') == 'lxml': + pytest.skip("Not applicable for lxml") - helper_thread1.start() - helper_thread2.start() + class UnseekableStringIO(StringIO): + def seekable(self): + return False - while helper_thread1.is_alive() or helper_thread2.is_alive(): - pass - assert None is helper_thread1.err is helper_thread2.err + bad = UnseekableStringIO(''' +
spameggs
''') + assert self.read_html(bad) -def test_parse_failure_unseekable(): - # Issue #17975 - _skip_if_no('lxml') - _skip_if_no('bs4') + with pytest.raises(ValueError, + match='passed a non-rewindable file object'): + self.read_html(bad) - class UnseekableStringIO(StringIO): - def seekable(self): - return False + def test_parse_failure_rewinds(self): + # Issue #17975 - good = UnseekableStringIO(''' -
spam
eggs
''') - bad = UnseekableStringIO(''' -
spameggs
''') + class MockFile(object): + def __init__(self, data): + self.data = data + self.at_end = False - assert read_html(good) - assert read_html(bad, flavor='bs4') + def read(self, size=None): + data = '' if self.at_end else self.data + self.at_end = True + return data - bad.seek(0) + def seek(self, offset): + self.at_end = False - with pytest.raises(ValueError, - match='passed a non-rewindable file object'): - read_html(bad) + def seekable(self): + return True + good = MockFile('
spam
eggs
') + bad = MockFile('
spameggs
') -def test_parse_failure_rewinds(): - # Issue #17975 - _skip_if_no('lxml') - _skip_if_no('bs4') + assert self.read_html(good) + assert self.read_html(bad) - class MockFile(object): - def __init__(self, data): - self.data = data - self.at_end = False + @pytest.mark.slow + def test_importcheck_thread_safety(self): + # see gh-16928 - def read(self, size=None): - data = '' if self.at_end else self.data - self.at_end = True - return data + class ErrorThread(threading.Thread): + def run(self): + try: + super(ErrorThread, self).run() + except Exception as e: + self.err = e + else: + self.err = None - def seek(self, offset): - self.at_end = False + # force import check by reinitalising global vars in html.py + reload(pandas.io.html) - def seekable(self): - return True + filename = os.path.join(DATA_PATH, 'valid_markup.html') + helper_thread1 = ErrorThread(target=self.read_html, args=(filename,)) + helper_thread2 = ErrorThread(target=self.read_html, args=(filename,)) - good = MockFile('
spam
eggs
') - bad = MockFile('
spameggs
') + helper_thread1.start() + helper_thread2.start() - assert read_html(good) - assert read_html(bad) + while helper_thread1.is_alive() or helper_thread2.is_alive(): + pass + assert None is helper_thread1.err is helper_thread2.err