From 47dedf452e16d3d03a9666adc703b8e06500dbb4 Mon Sep 17 00:00:00 2001
From: Parfait Gasana <parfait.gasana@gmail.com>
Date: Tue, 9 Mar 2021 20:49:27 -0600
Subject: [PATCH 1/5] TYP: Add typing for remaining IO XML methods with
 conditional for lxml

---
 pandas/io/formats/xml.py |   4 +-
 pandas/io/xml.py         | 110 ++++++++++++++++++++++-----------------
 2 files changed, 63 insertions(+), 51 deletions(-)

diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py
index 6987da63eaf3e..5c7255d5e6ee4 100644
--- a/pandas/io/formats/xml.py
+++ b/pandas/io/formats/xml.py
@@ -288,7 +288,7 @@ class EtreeXMLFormatter(BaseXMLFormatter):
     modules: `xml.etree.ElementTree` and `xml.dom.minidom`.
     """
 
-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
 
         self.validate_columns()
@@ -452,7 +452,7 @@ class LxmlXMLFormatter(BaseXMLFormatter):
     modules: `xml.etree.ElementTree` and `xml.dom.minidom`.
     """
 
-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
 
         self.validate_columns()
diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index 83eba5f17c7b3..8be3392062228 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -37,6 +37,8 @@
 )
 from pandas.io.parsers import TextParser
 
+lxml = import_optional_dependency("lxml.etree", errors="ignore")
+
 
 class _XMLFrameParser:
     """
@@ -90,7 +92,6 @@ class _XMLFrameParser:
     To subclass this class effectively you must override the following methods:`
         * :func:`parse_data`
         * :func:`_parse_nodes`
-        * :func:`_parse_doc`
         * :func:`_validate_names`
         * :func:`_validate_path`
 
@@ -111,7 +112,7 @@ def __init__(
         stylesheet,
         compression,
         storage_options,
-    ):
+    ) -> None:
         self.path_or_buffer = path_or_buffer
         self.xpath = xpath
         self.namespaces = namespaces
@@ -187,16 +188,6 @@ def _validate_names(self) -> None:
         """
         raise AbstractMethodError(self)
 
-    def _parse_doc(self):
-        """
-        Build tree from io.
-
-        This method will parse io object into tree for parsing
-        conditionally by its specific object type.
-        """
-
-        raise AbstractMethodError(self)
-
 
 class _EtreeFrameParser(_XMLFrameParser):
     """
@@ -209,7 +200,7 @@ class _EtreeFrameParser(_XMLFrameParser):
         ElementTree,
     )
 
-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
 
     def parse_data(self) -> List[Dict[str, Optional[str]]]:
@@ -357,6 +348,12 @@ def _validate_names(self) -> None:
                 )
 
     def _parse_doc(self) -> Union[Element, ElementTree]:
+        """
+        Build tree from path_or_buffer.
+
+        This method will parse XML object into tree
+        either from string/bytes or file location.
+        """
         from xml.etree.ElementTree import (
             XMLParser,
             parse,
@@ -383,7 +380,7 @@ class _LxmlFrameParser(_XMLFrameParser):
     XPath 1.0 and XSLT 1.0.
     """
 
-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
 
     def parse_data(self) -> List[Dict[str, Optional[str]]]:
@@ -491,21 +488,6 @@ def _parse_nodes(self) -> List[Dict[str, Optional[str]]]:
 
         return dicts
 
-    def _transform_doc(self):
-        """
-        Transform original tree using stylesheet.
-
-        This method will transform original xml using XSLT script into
-        am ideally flatter xml document for easier parsing and migration
-        to Data Frame.
-        """
-        from lxml.etree import XSLT
-
-        transformer = XSLT(self.xsl_doc)
-        new_doc = transformer(self.xml_doc)
-
-        return new_doc
-
     def _validate_path(self) -> None:
 
         msg = (
@@ -553,31 +535,62 @@ def _validate_names(self) -> None:
                     f"{type(self.names).__name__} is not a valid type for names"
                 )
 
-    def _parse_doc(self, raw_doc):
+    if lxml is not None:
         from lxml.etree import (
-            XMLParser,
-            fromstring,
-            parse,
+            Element,
+            ElementTree,
         )
 
-        handle_data = get_data_from_filepath(
-            filepath_or_buffer=raw_doc,
-            encoding=self.encoding,
-            compression=self.compression,
-            storage_options=self.storage_options,
-        )
+        def _parse_doc(self, raw_doc) -> Union[Element, ElementTree]:
+            """
+            Build tree from path_or_buffer.
 
-        with preprocess_data(handle_data) as xml_data:
-            curr_parser = XMLParser(encoding=self.encoding)
+            This method will parse XML object into tree
+            either from string/bytes or file location.
+            """
 
-            if isinstance(xml_data, io.StringIO):
-                doc = fromstring(
-                    xml_data.getvalue().encode(self.encoding), parser=curr_parser
-                )
-            else:
-                doc = parse(xml_data, parser=curr_parser)
+            from lxml.etree import (
+                XMLParser,
+                fromstring,
+                parse,
+            )
+
+            handle_data = get_data_from_filepath(
+                filepath_or_buffer=raw_doc,
+                encoding=self.encoding,
+                compression=self.compression,
+                storage_options=self.storage_options,
+            )
+
+            with preprocess_data(handle_data) as xml_data:
+                curr_parser = XMLParser(encoding=self.encoding)
+
+                if isinstance(xml_data, io.StringIO):
+                    doc = fromstring(
+                        xml_data.getvalue().encode(self.encoding), parser=curr_parser
+                    )
+                else:
+                    doc = parse(xml_data, parser=curr_parser)
+
+            return doc
+
+        def _transform_doc(self) -> Element:
+            """
+            Transform original tree using stylesheet.
+
+            This method will transform original xml using XSLT script into
+            am ideally flatter xml document for easier parsing and migration
+            to Data Frame.
+            """
+            from lxml.etree import (
+                XML,
+                XSLT,
+            )
+
+            transformer = XSLT(self.xsl_doc)
+            new_doc = transformer(self.xml_doc)
 
-        return doc
+            return XML(bytes(new_doc))
 
 
 def get_data_from_filepath(
@@ -694,7 +707,6 @@ def _parse(
         * If parser is not lxml or etree.
     """
 
-    lxml = import_optional_dependency("lxml.etree", errors="ignore")
     p: Union[_EtreeFrameParser, _LxmlFrameParser]
 
     if parser == "lxml":

From 9c62e6e924c98cc5fdcb15e18e84e1e008b3a7a4 Mon Sep 17 00:00:00 2001
From: Parfait Gasana <parfait.gasana@gmail.com>
Date: Wed, 10 Mar 2021 07:24:39 -0600
Subject: [PATCH 2/5] Refactor parse_doc method for bytes type hinting and
 avoid optional dependency import

---
 pandas/io/xml.py | 117 +++++++++++++++++++++--------------------------
 1 file changed, 52 insertions(+), 65 deletions(-)

diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index 8be3392062228..e547588052548 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -37,8 +37,6 @@
 )
 from pandas.io.parsers import TextParser
 
-lxml = import_optional_dependency("lxml.etree", errors="ignore")
-
 
 class _XMLFrameParser:
     """
@@ -92,6 +90,7 @@ class _XMLFrameParser:
     To subclass this class effectively you must override the following methods:`
         * :func:`parse_data`
         * :func:`_parse_nodes`
+        * :func:`_parse_doc`
         * :func:`_validate_names`
         * :func:`_validate_path`
 
@@ -188,6 +187,15 @@ def _validate_names(self) -> None:
         """
         raise AbstractMethodError(self)
 
+    def _parse_doc(self) -> bytes:
+        """
+        Build tree from path_or_buffer.
+
+        This method will parse XML object into tree
+        either from string/bytes or file location.
+        """
+        raise AbstractMethodError(self)
+
 
 class _EtreeFrameParser(_XMLFrameParser):
     """
@@ -195,22 +203,18 @@ class _EtreeFrameParser(_XMLFrameParser):
     standard library XML module: `xml.etree.ElementTree`.
     """
 
-    from xml.etree.ElementTree import (
-        Element,
-        ElementTree,
-    )
-
     def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
 
     def parse_data(self) -> List[Dict[str, Optional[str]]]:
+        from xml.etree.ElementTree import XML
 
         if self.stylesheet is not None:
             raise ValueError(
                 "To use stylesheet, you need lxml installed and selected as parser."
             )
 
-        self.xml_doc = self._parse_doc()
+        self.xml_doc = XML(self._parse_doc())
 
         self._validate_path()
         self._validate_names()
@@ -347,16 +351,11 @@ def _validate_names(self) -> None:
                     f"{type(self.names).__name__} is not a valid type for names"
                 )
 
-    def _parse_doc(self) -> Union[Element, ElementTree]:
-        """
-        Build tree from path_or_buffer.
-
-        This method will parse XML object into tree
-        either from string/bytes or file location.
-        """
+    def _parse_doc(self) -> bytes:
         from xml.etree.ElementTree import (
             XMLParser,
             parse,
+            tostring,
         )
 
         handle_data = get_data_from_filepath(
@@ -370,7 +369,7 @@ def _parse_doc(self) -> Union[Element, ElementTree]:
             curr_parser = XMLParser(encoding=self.encoding)
             r = parse(xml_data, parser=curr_parser)
 
-        return r
+        return tostring(r.getroot())
 
 
 class _LxmlFrameParser(_XMLFrameParser):
@@ -391,12 +390,13 @@ def parse_data(self) -> List[Dict[str, Optional[str]]]:
         validate xpath, names, optionally parse and run XSLT,
         and parse original or transformed XML and return specific nodes.
         """
+        from lxml.etree import XML
 
-        self.xml_doc = self._parse_doc(self.path_or_buffer)
+        self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
 
         if self.stylesheet is not None:
-            self.xsl_doc = self._parse_doc(self.stylesheet)
-            self.xml_doc = self._transform_doc()
+            self.xsl_doc = XML(self._parse_doc(self.stylesheet))
+            self.xml_doc = XML(self._transform_doc())
 
         self._validate_path()
         self._validate_names()
@@ -535,62 +535,47 @@ def _validate_names(self) -> None:
                     f"{type(self.names).__name__} is not a valid type for names"
                 )
 
-    if lxml is not None:
+    def _parse_doc(self, raw_doc) -> bytes:
         from lxml.etree import (
-            Element,
-            ElementTree,
+            XMLParser,
+            fromstring,
+            parse,
+            tostring,
         )
 
-        def _parse_doc(self, raw_doc) -> Union[Element, ElementTree]:
-            """
-            Build tree from path_or_buffer.
+        handle_data = get_data_from_filepath(
+            filepath_or_buffer=raw_doc,
+            encoding=self.encoding,
+            compression=self.compression,
+            storage_options=self.storage_options,
+        )
 
-            This method will parse XML object into tree
-            either from string/bytes or file location.
-            """
+        with preprocess_data(handle_data) as xml_data:
+            curr_parser = XMLParser(encoding=self.encoding)
 
-            from lxml.etree import (
-                XMLParser,
-                fromstring,
-                parse,
-            )
+            if isinstance(xml_data, io.StringIO):
+                doc = fromstring(
+                    xml_data.getvalue().encode(self.encoding), parser=curr_parser
+                )
+            else:
+                doc = parse(xml_data, parser=curr_parser)
 
-            handle_data = get_data_from_filepath(
-                filepath_or_buffer=raw_doc,
-                encoding=self.encoding,
-                compression=self.compression,
-                storage_options=self.storage_options,
-            )
+        return tostring(doc)
 
-            with preprocess_data(handle_data) as xml_data:
-                curr_parser = XMLParser(encoding=self.encoding)
+    def _transform_doc(self) -> bytes:
+        """
+        Transform original tree using stylesheet.
 
-                if isinstance(xml_data, io.StringIO):
-                    doc = fromstring(
-                        xml_data.getvalue().encode(self.encoding), parser=curr_parser
-                    )
-                else:
-                    doc = parse(xml_data, parser=curr_parser)
-
-            return doc
-
-        def _transform_doc(self) -> Element:
-            """
-            Transform original tree using stylesheet.
-
-            This method will transform original xml using XSLT script into
-            am ideally flatter xml document for easier parsing and migration
-            to Data Frame.
-            """
-            from lxml.etree import (
-                XML,
-                XSLT,
-            )
+        This method will transform original xml using XSLT script into
+        am ideally flatter xml document for easier parsing and migration
+        to Data Frame.
+        """
+        from lxml.etree import XSLT
 
-            transformer = XSLT(self.xsl_doc)
-            new_doc = transformer(self.xml_doc)
+        transformer = XSLT(self.xsl_doc)
+        new_doc = transformer(self.xml_doc)
 
-            return XML(bytes(new_doc))
+        return bytes(new_doc)
 
 
 def get_data_from_filepath(
@@ -707,6 +692,8 @@ def _parse(
         * If parser is not lxml or etree.
     """
 
+    lxml = import_optional_dependency("lxml.etree", errors="ignore")
+
     p: Union[_EtreeFrameParser, _LxmlFrameParser]
 
     if parser == "lxml":

From b197085537ffa7bd83bb49c4c60c3a3cac27dc8a Mon Sep 17 00:00:00 2001
From: Parfait Gasana <parfait.gasana@gmail.com>
Date: Wed, 10 Mar 2021 07:42:20 -0600
Subject: [PATCH 3/5] Adjust base and etree classes for consistent parse_doc
 signature

---
 pandas/io/xml.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index e547588052548..abbfe7657624f 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -187,7 +187,7 @@ def _validate_names(self) -> None:
         """
         raise AbstractMethodError(self)
 
-    def _parse_doc(self) -> bytes:
+    def _parse_doc(self, raw_doc) -> bytes:
         """
         Build tree from path_or_buffer.
 
@@ -214,7 +214,7 @@ def parse_data(self) -> List[Dict[str, Optional[str]]]:
                 "To use stylesheet, you need lxml installed and selected as parser."
             )
 
-        self.xml_doc = XML(self._parse_doc())
+        self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
 
         self._validate_path()
         self._validate_names()
@@ -351,7 +351,7 @@ def _validate_names(self) -> None:
                     f"{type(self.names).__name__} is not a valid type for names"
                 )
 
-    def _parse_doc(self) -> bytes:
+    def _parse_doc(self, raw_doc) -> bytes:
         from xml.etree.ElementTree import (
             XMLParser,
             parse,
@@ -359,7 +359,7 @@ def _parse_doc(self) -> bytes:
         )
 
         handle_data = get_data_from_filepath(
-            filepath_or_buffer=self.path_or_buffer,
+            filepath_or_buffer=raw_doc,
             encoding=self.encoding,
             compression=self.compression,
             storage_options=self.storage_options,

From f35d879ba287f45b13a754d6be89cd45fc1c5f51 Mon Sep 17 00:00:00 2001
From: Parfait Gasana <parfait.gasana@gmail.com>
Date: Wed, 10 Mar 2021 19:06:53 -0600
Subject: [PATCH 4/5] Add test skip for etree before python 3.8 due to alpha
 order of attributes

---
 pandas/tests/io/xml/test_xml.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index 6902b4e93443f..13e6a5344acd4 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -3,6 +3,7 @@
     StringIO,
 )
 import os
+import sys
 from typing import Union
 from urllib.error import HTTPError
 
@@ -253,6 +254,10 @@ def test_parser_consistency_file(datapath):
 @tm.network
 @pytest.mark.slow
 @td.skip_if_no("lxml")
+@pytest.mark.skipif(
+    sys.version_info < (3, 8),
+    reason=("etree alpha ordered attributes <= py3.7"),
+)
 def test_parser_consistency_url(datapath):
     url = (
         "https://data.cityofchicago.org/api/views/"

From d68f1396f793a43ad750b27c50e4f553fc9bb617 Mon Sep 17 00:00:00 2001
From: Parfait Gasana <parfait.gasana@gmail.com>
Date: Tue, 16 Mar 2021 11:26:15 -0500
Subject: [PATCH 5/5] Use PY38 variable in tests for python < 3.8 skips

---
 pandas/tests/io/xml/test_to_xml.py | 18 +++++++++---------
 pandas/tests/io/xml/test_xml.py    |  6 +++---
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py
index 97793ce8f65b8..0b95750e94736 100644
--- a/pandas/tests/io/xml/test_to_xml.py
+++ b/pandas/tests/io/xml/test_to_xml.py
@@ -3,12 +3,12 @@
     StringIO,
 )
 import os
-import sys
 from typing import Union
 
 import numpy as np
 import pytest
 
+from pandas.compat import PY38
 import pandas.util._test_decorators as td
 
 from pandas import DataFrame
@@ -364,8 +364,8 @@ def test_na_empty_elem_option(datapath, parser):
 
 
 @pytest.mark.skipif(
-    sys.version_info < (3, 8),
-    reason=("etree alpha ordered attributes <= py3.7"),
+    not PY38,
+    reason=("etree alpha ordered attributes < py 3.8"),
 )
 def test_attrs_cols_nan_output(datapath, parser):
     expected = """\
@@ -383,8 +383,8 @@ def test_attrs_cols_nan_output(datapath, parser):
 
 
 @pytest.mark.skipif(
-    sys.version_info < (3, 8),
-    reason=("etree alpha ordered attributes <= py3.7"),
+    not PY38,
+    reason=("etree alpha ordered attributes < py3.8"),
 )
 def test_attrs_cols_prefix(datapath, parser):
     expected = """\
@@ -541,8 +541,8 @@ def test_hierarchical_columns(datapath, parser):
 
 
 @pytest.mark.skipif(
-    sys.version_info < (3, 8),
-    reason=("etree alpha ordered attributes <= py3.7"),
+    not PY38,
+    reason=("etree alpha ordered attributes < py3.8"),
 )
 def test_hierarchical_attrs_columns(datapath, parser):
     expected = """\
@@ -614,8 +614,8 @@ def test_multi_index(datapath, parser):
 
 
 @pytest.mark.skipif(
-    sys.version_info < (3, 8),
-    reason=("etree alpha ordered attributes <= py3.7"),
+    not PY38,
+    reason=("etree alpha ordered attributes < py3.8"),
 )
 def test_multi_index_attrs_cols(datapath, parser):
     expected = """\
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index 13e6a5344acd4..95751b6090a06 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -3,13 +3,13 @@
     StringIO,
 )
 import os
-import sys
 from typing import Union
 from urllib.error import HTTPError
 
 import numpy as np
 import pytest
 
+from pandas.compat import PY38
 import pandas.util._test_decorators as td
 
 from pandas import DataFrame
@@ -255,8 +255,8 @@ def test_parser_consistency_file(datapath):
 @pytest.mark.slow
 @td.skip_if_no("lxml")
 @pytest.mark.skipif(
-    sys.version_info < (3, 8),
-    reason=("etree alpha ordered attributes <= py3.7"),
+    not PY38,
+    reason=("etree alpha ordered attributes < py3.8"),
 )
 def test_parser_consistency_url(datapath):
     url = (