From 85bc22ac9ac352190f0f677725eedd2845b61daf Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 4 Aug 2021 14:02:53 -0700 Subject: [PATCH 1/8] Revert "Fix bug on master (#38987)" This reverts commit 68db2d26ddb5f95de4254d61b850d3dcaf6ce717. --- doc/source/user_guide/io.rst | 12 +++++++----- pandas/tests/io/test_html.py | 2 -- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 939fd5b832cef..d4050c4d93d53 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2464,14 +2464,16 @@ Read a URL with no options: .. ipython:: python - url = ( - "https://raw.githubusercontent.com/pandas-dev/pandas/master/" - "pandas/tests/io/data/html/spam.html" - ) + url = "https://www.fdic.gov/bank/individual/failed/banklist.html" dfs = pd.read_html(url) dfs -Read in the content of the "banklist.html" file and pass it to ``read_html`` +.. note:: + + The data from the above URL changes every Monday so the resulting data above + and the data below may be slightly different. + +Read in the content of the file from the above URL and pass it to ``read_html`` as a string: .. ipython:: python diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index f842e4cd58863..7c7dff0b73390 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -134,7 +134,6 @@ def test_to_html_compat(self): res = self.read_html(out, attrs={"class": "dataframe"}, index_col=0)[0] tm.assert_frame_equal(res, df) - @pytest.mark.xfail(reason="Html file was removed") @tm.network def test_banklist_url_positional_match(self): url = "https://www.fdic.gov/bank/individual/failed/banklist.html" @@ -148,7 +147,6 @@ def test_banklist_url_positional_match(self): assert_framelist_equal(df1, df2) - @pytest.mark.xfail(reason="Html file was removed") @tm.network def test_banklist_url(self): url = "https://www.fdic.gov/bank/individual/failed/banklist.html" From 71fcb01799dd60d478c54ae90cddc8ceb98e555d Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 4 Aug 2021 14:05:09 -0700 Subject: [PATCH 2/8] Update io.rst --- doc/source/user_guide/io.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index d4050c4d93d53..b5ddfc0e8a652 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2464,7 +2464,7 @@ Read a URL with no options: .. ipython:: python - url = "https://www.fdic.gov/bank/individual/failed/banklist.html" + url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html" dfs = pd.read_html(url) dfs From 2d9506617be0cbc36b40dbf9b07ccf5d4da77cae Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 4 Aug 2021 14:06:32 -0700 Subject: [PATCH 3/8] Update test_html.py --- pandas/tests/io/test_html.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 7c7dff0b73390..632fbaf142029 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -136,7 +136,7 @@ def test_to_html_compat(self): @tm.network def test_banklist_url_positional_match(self): - url = "https://www.fdic.gov/bank/individual/failed/banklist.html" + url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html" # Passing match argument as positional should cause a FutureWarning. with tm.assert_produces_warning(FutureWarning): df1 = self.read_html( @@ -149,7 +149,7 @@ def test_banklist_url_positional_match(self): @tm.network def test_banklist_url(self): - url = "https://www.fdic.gov/bank/individual/failed/banklist.html" + url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html" df1 = self.read_html( url, match="First Federal Bank of Florida", attrs={"id": "table"} ) From 91f2763e8e6b165380e86760b18cbfe8e38f4a19 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 4 Aug 2021 15:23:33 -0700 Subject: [PATCH 4/8] Update test_html.py --- pandas/tests/io/test_html.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 632fbaf142029..eb98b07acc175 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -136,24 +136,24 @@ def test_to_html_compat(self): @tm.network def test_banklist_url_positional_match(self): - url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html" + url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html" # noqa E501 # Passing match argument as positional should cause a FutureWarning. with tm.assert_produces_warning(FutureWarning): df1 = self.read_html( - url, "First Federal Bank of Florida", attrs={"id": "table"} + url, "First Federal Bank of Florida", attrs={"class": "dataTable"} ) with tm.assert_produces_warning(FutureWarning): - df2 = self.read_html(url, "Metcalf Bank", attrs={"id": "table"}) + df2 = self.read_html(url, "Metcalf Bank", attrs={"class": "dataTable"}) assert_framelist_equal(df1, df2) @tm.network def test_banklist_url(self): - url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html" + url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html" # noqa E501 df1 = self.read_html( - url, match="First Federal Bank of Florida", attrs={"id": "table"} + url, match="First Federal Bank of Florida", attrs={"class": "dataTable"} ) - df2 = self.read_html(url, match="Metcalf Bank", attrs={"id": "table"}) + df2 = self.read_html(url, match="Metcalf Bank", attrs={"class": "dataTable"}) assert_framelist_equal(df1, df2) From 76e15e5ba22d5ccf9ea9a4e51ef18287734b182b Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 6 Aug 2021 13:09:24 -0700 Subject: [PATCH 5/8] try http instead of https? --- pandas/tests/io/test_html.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index eb98b07acc175..a2b05ace03bda 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -136,7 +136,7 @@ def test_to_html_compat(self): @tm.network def test_banklist_url_positional_match(self): - url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html" # noqa E501 + url = "http://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html" # noqa E501 # Passing match argument as positional should cause a FutureWarning. with tm.assert_produces_warning(FutureWarning): df1 = self.read_html( @@ -149,7 +149,7 @@ def test_banklist_url_positional_match(self): @tm.network def test_banklist_url(self): - url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html" # noqa E501 + url = "http://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html" # noqa E501 df1 = self.read_html( url, match="First Federal Bank of Florida", attrs={"class": "dataTable"} ) From 077901bcd8f70d7a654e123f5ed0e3f18552de24 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 31 Aug 2021 20:17:42 -0700 Subject: [PATCH 6/8] Update test_html.py --- pandas/tests/io/test_html.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index a2b05ace03bda..b23efd503ffb7 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -140,10 +140,12 @@ def test_banklist_url_positional_match(self): # Passing match argument as positional should cause a FutureWarning. with tm.assert_produces_warning(FutureWarning): df1 = self.read_html( - url, "First Federal Bank of Florida", attrs={"class": "dataTable"} + # lxml cannot find attrs leave out for now + url, "First Federal Bank of Florida", # attrs={"class": "dataTable"} ) with tm.assert_produces_warning(FutureWarning): - df2 = self.read_html(url, "Metcalf Bank", attrs={"class": "dataTable"}) + # lxml cannot find attrs leave out for now + df2 = self.read_html(url, "Metcalf Bank",) #attrs={"class": "dataTable"}) assert_framelist_equal(df1, df2) @@ -151,9 +153,11 @@ def test_banklist_url_positional_match(self): def test_banklist_url(self): url = "http://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html" # noqa E501 df1 = self.read_html( - url, match="First Federal Bank of Florida", attrs={"class": "dataTable"} + # lxml cannot find attrs leave out for now + url, match="First Federal Bank of Florida", # attrs={"class": "dataTable"} ) - df2 = self.read_html(url, match="Metcalf Bank", attrs={"class": "dataTable"}) + # lxml cannot find attrs leave out for now + df2 = self.read_html(url, match="Metcalf Bank",) #attrs={"class": "dataTable"}) assert_framelist_equal(df1, df2) From d80f47db1d62417ff4d49be7be488840a18f2780 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 1 Sep 2021 04:36:02 +0000 Subject: [PATCH 7/8] Fixes from pre-commit [automated commit] --- pandas/tests/io/test_html.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index b23efd503ffb7..1363a0b04ee0a 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -136,28 +136,36 @@ def test_to_html_compat(self): @tm.network def test_banklist_url_positional_match(self): - url = "http://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html" # noqa E501 + url = "http://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html" # noqa E501 # Passing match argument as positional should cause a FutureWarning. with tm.assert_produces_warning(FutureWarning): df1 = self.read_html( # lxml cannot find attrs leave out for now - url, "First Federal Bank of Florida", # attrs={"class": "dataTable"} + url, + "First Federal Bank of Florida", # attrs={"class": "dataTable"} ) with tm.assert_produces_warning(FutureWarning): # lxml cannot find attrs leave out for now - df2 = self.read_html(url, "Metcalf Bank",) #attrs={"class": "dataTable"}) + df2 = self.read_html( + url, + "Metcalf Bank", + ) # attrs={"class": "dataTable"}) assert_framelist_equal(df1, df2) @tm.network def test_banklist_url(self): - url = "http://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html" # noqa E501 + url = "http://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html" # noqa E501 df1 = self.read_html( # lxml cannot find attrs leave out for now - url, match="First Federal Bank of Florida", # attrs={"class": "dataTable"} + url, + match="First Federal Bank of Florida", # attrs={"class": "dataTable"} ) # lxml cannot find attrs leave out for now - df2 = self.read_html(url, match="Metcalf Bank",) #attrs={"class": "dataTable"}) + df2 = self.read_html( + url, + match="Metcalf Bank", + ) # attrs={"class": "dataTable"}) assert_framelist_equal(df1, df2) From d3a3df781000d63edf5ecefd4c8acba071d876d3 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 31 Aug 2021 21:49:59 -0700 Subject: [PATCH 8/8] lint --- doc/source/user_guide/io.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 4d96638b0dcaa..4c7b13bcf989f 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2502,7 +2502,7 @@ Read a URL with no options: .. ipython:: python - url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html" + url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list" dfs = pd.read_html(url) dfs