pandas-dev · mroeschke · Mar 28, 2023 · Mar 27, 2023 · Mar 27, 2023 · Mar 27, 2023
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -208,6 +208,7 @@ I/O
 ^^^
 - Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`)
 - :meth:`DataFrame.to_orc` now raising ``ValueError`` when non-default :class:`Index` is given (:issue:`51828`)
+- Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`)
 -
 
 Period

diff --git a/pandas/io/html.py b/pandas/io/html.py
@@ -582,7 +582,6 @@ def __init__(self, *args, **kwargs) -> None:
     def _parse_tables(self, doc, match, attrs):
         element_name = self._strainer.name
         tables = doc.find_all(element_name, attrs=attrs)
-
         if not tables:
             raise ValueError("No tables found")
 
@@ -592,13 +591,15 @@ def _parse_tables(self, doc, match, attrs):
 
         for table in tables:
             if self.displayed_only:
+                for elem in table.find_all("style"):
+                    elem.decompose()
+
                 for elem in table.find_all(style=re.compile(r"display:\s*none")):
                     elem.decompose()
 
             if table not in unique_tables and table.find(string=match) is not None:
                 result.append(table)
             unique_tables.add(table)
-
         if not result:
             raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
         return result
@@ -730,10 +731,11 @@ def _parse_tables(self, doc, match, kwargs):
                 # lxml utilizes XPATH 1.0 which does not have regex
                 # support. As a result, we find all elements with a style
                 # attribute and iterate them to check for display:none
+                for elem in table.xpath(".//style"):
+                    elem.drop_tree()
                 for elem in table.xpath(".//*[@style]"):
                     if "display:none" in elem.attrib.get("style", "").replace(" ", ""):
                         elem.drop_tree()
-
         if not tables:
             raise ValueError(f"No tables found matching regex {repr(pattern)}")
         return tables
@@ -1170,6 +1172,7 @@ def read_html(
             '{None, "header", "footer", "body", "all"}, got '
             f'"{extract_links}"'
         )
+
     validate_header_arg(header)
     check_dtype_backend(dtype_backend)
 

diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
@@ -1495,3 +1495,28 @@ def test_invalid_dtype_backend(self):
         )
         with pytest.raises(ValueError, match=msg):
             read_html("test", dtype_backend="numpy")
+
+    def test_style_tag(self):
+        # GH 48316
+        data = """
+        <table>
+            <tr>
+                <th>
+                    <style>.style</style>
+                    A
+                    </th>
+                <th>B</th>
+            </tr>
+            <tr>
+                <td>A1</td>
+                <td>B1</td>
+            </tr>
+            <tr>
+                <td>A2</td>
+                <td>B2</td>
+            </tr>
+        </table>
+        """
+        result = self.read_html(data)[0]
+        expected = DataFrame(data=[["A1", "B1"], ["A2", "B2"]], columns=["A", "B"])
+        tm.assert_frame_equal(result, expected)
-Original file line number
+Diff line change
@@ Expand Up / @@ -208,6 +208,7 @@ I/O @@
     ^^^
     - Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`)
     - :meth:`DataFrame.to_orc` now raising ``ValueError`` when non-default :class:`Index` is given (:issue:`51828`)
+    - Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`)
     -
     Period
@@ Expand Down @@