From 47dedf452e16d3d03a9666adc703b8e06500dbb4 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Tue, 9 Mar 2021 20:49:27 -0600 Subject: [PATCH 1/5] TYP: Add typing for remaining IO XML methods with conditional for lxml --- pandas/io/formats/xml.py | 4 +- pandas/io/xml.py | 110 ++++++++++++++++++++++----------------- 2 files changed, 63 insertions(+), 51 deletions(-) diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index 6987da63eaf3e..5c7255d5e6ee4 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -288,7 +288,7 @@ class EtreeXMLFormatter(BaseXMLFormatter): modules: `xml.etree.ElementTree` and `xml.dom.minidom`. """ - def __init__(self, *args, **kwargs): + def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.validate_columns() @@ -452,7 +452,7 @@ class LxmlXMLFormatter(BaseXMLFormatter): modules: `xml.etree.ElementTree` and `xml.dom.minidom`. """ - def __init__(self, *args, **kwargs): + def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.validate_columns() diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 83eba5f17c7b3..8be3392062228 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -37,6 +37,8 @@ ) from pandas.io.parsers import TextParser +lxml = import_optional_dependency("lxml.etree", errors="ignore") + class _XMLFrameParser: """ @@ -90,7 +92,6 @@ class _XMLFrameParser: To subclass this class effectively you must override the following methods:` * :func:`parse_data` * :func:`_parse_nodes` - * :func:`_parse_doc` * :func:`_validate_names` * :func:`_validate_path` @@ -111,7 +112,7 @@ def __init__( stylesheet, compression, storage_options, - ): + ) -> None: self.path_or_buffer = path_or_buffer self.xpath = xpath self.namespaces = namespaces @@ -187,16 +188,6 @@ def _validate_names(self) -> None: """ raise AbstractMethodError(self) - def _parse_doc(self): - """ - Build tree from io. - - This method will parse io object into tree for parsing - conditionally by its specific object type. - """ - - raise AbstractMethodError(self) - class _EtreeFrameParser(_XMLFrameParser): """ @@ -209,7 +200,7 @@ class _EtreeFrameParser(_XMLFrameParser): ElementTree, ) - def __init__(self, *args, **kwargs): + def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) def parse_data(self) -> List[Dict[str, Optional[str]]]: @@ -357,6 +348,12 @@ def _validate_names(self) -> None: ) def _parse_doc(self) -> Union[Element, ElementTree]: + """ + Build tree from path_or_buffer. + + This method will parse XML object into tree + either from string/bytes or file location. + """ from xml.etree.ElementTree import ( XMLParser, parse, @@ -383,7 +380,7 @@ class _LxmlFrameParser(_XMLFrameParser): XPath 1.0 and XSLT 1.0. """ - def __init__(self, *args, **kwargs): + def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) def parse_data(self) -> List[Dict[str, Optional[str]]]: @@ -491,21 +488,6 @@ def _parse_nodes(self) -> List[Dict[str, Optional[str]]]: return dicts - def _transform_doc(self): - """ - Transform original tree using stylesheet. - - This method will transform original xml using XSLT script into - am ideally flatter xml document for easier parsing and migration - to Data Frame. - """ - from lxml.etree import XSLT - - transformer = XSLT(self.xsl_doc) - new_doc = transformer(self.xml_doc) - - return new_doc - def _validate_path(self) -> None: msg = ( @@ -553,31 +535,62 @@ def _validate_names(self) -> None: f"{type(self.names).__name__} is not a valid type for names" ) - def _parse_doc(self, raw_doc): + if lxml is not None: from lxml.etree import ( - XMLParser, - fromstring, - parse, + Element, + ElementTree, ) - handle_data = get_data_from_filepath( - filepath_or_buffer=raw_doc, - encoding=self.encoding, - compression=self.compression, - storage_options=self.storage_options, - ) + def _parse_doc(self, raw_doc) -> Union[Element, ElementTree]: + """ + Build tree from path_or_buffer. - with preprocess_data(handle_data) as xml_data: - curr_parser = XMLParser(encoding=self.encoding) + This method will parse XML object into tree + either from string/bytes or file location. + """ - if isinstance(xml_data, io.StringIO): - doc = fromstring( - xml_data.getvalue().encode(self.encoding), parser=curr_parser - ) - else: - doc = parse(xml_data, parser=curr_parser) + from lxml.etree import ( + XMLParser, + fromstring, + parse, + ) + + handle_data = get_data_from_filepath( + filepath_or_buffer=raw_doc, + encoding=self.encoding, + compression=self.compression, + storage_options=self.storage_options, + ) + + with preprocess_data(handle_data) as xml_data: + curr_parser = XMLParser(encoding=self.encoding) + + if isinstance(xml_data, io.StringIO): + doc = fromstring( + xml_data.getvalue().encode(self.encoding), parser=curr_parser + ) + else: + doc = parse(xml_data, parser=curr_parser) + + return doc + + def _transform_doc(self) -> Element: + """ + Transform original tree using stylesheet. + + This method will transform original xml using XSLT script into + am ideally flatter xml document for easier parsing and migration + to Data Frame. + """ + from lxml.etree import ( + XML, + XSLT, + ) + + transformer = XSLT(self.xsl_doc) + new_doc = transformer(self.xml_doc) - return doc + return XML(bytes(new_doc)) def get_data_from_filepath( @@ -694,7 +707,6 @@ def _parse( * If parser is not lxml or etree. """ - lxml = import_optional_dependency("lxml.etree", errors="ignore") p: Union[_EtreeFrameParser, _LxmlFrameParser] if parser == "lxml": From 9c62e6e924c98cc5fdcb15e18e84e1e008b3a7a4 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Wed, 10 Mar 2021 07:24:39 -0600 Subject: [PATCH 2/5] Refactor parse_doc method for bytes type hinting and avoid optional dependency import --- pandas/io/xml.py | 117 +++++++++++++++++++++-------------------------- 1 file changed, 52 insertions(+), 65 deletions(-) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 8be3392062228..e547588052548 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -37,8 +37,6 @@ ) from pandas.io.parsers import TextParser -lxml = import_optional_dependency("lxml.etree", errors="ignore") - class _XMLFrameParser: """ @@ -92,6 +90,7 @@ class _XMLFrameParser: To subclass this class effectively you must override the following methods:` * :func:`parse_data` * :func:`_parse_nodes` + * :func:`_parse_doc` * :func:`_validate_names` * :func:`_validate_path` @@ -188,6 +187,15 @@ def _validate_names(self) -> None: """ raise AbstractMethodError(self) + def _parse_doc(self) -> bytes: + """ + Build tree from path_or_buffer. + + This method will parse XML object into tree + either from string/bytes or file location. + """ + raise AbstractMethodError(self) + class _EtreeFrameParser(_XMLFrameParser): """ @@ -195,22 +203,18 @@ class _EtreeFrameParser(_XMLFrameParser): standard library XML module: `xml.etree.ElementTree`. """ - from xml.etree.ElementTree import ( - Element, - ElementTree, - ) - def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) def parse_data(self) -> List[Dict[str, Optional[str]]]: + from xml.etree.ElementTree import XML if self.stylesheet is not None: raise ValueError( "To use stylesheet, you need lxml installed and selected as parser." ) - self.xml_doc = self._parse_doc() + self.xml_doc = XML(self._parse_doc()) self._validate_path() self._validate_names() @@ -347,16 +351,11 @@ def _validate_names(self) -> None: f"{type(self.names).__name__} is not a valid type for names" ) - def _parse_doc(self) -> Union[Element, ElementTree]: - """ - Build tree from path_or_buffer. - - This method will parse XML object into tree - either from string/bytes or file location. - """ + def _parse_doc(self) -> bytes: from xml.etree.ElementTree import ( XMLParser, parse, + tostring, ) handle_data = get_data_from_filepath( @@ -370,7 +369,7 @@ def _parse_doc(self) -> Union[Element, ElementTree]: curr_parser = XMLParser(encoding=self.encoding) r = parse(xml_data, parser=curr_parser) - return r + return tostring(r.getroot()) class _LxmlFrameParser(_XMLFrameParser): @@ -391,12 +390,13 @@ def parse_data(self) -> List[Dict[str, Optional[str]]]: validate xpath, names, optionally parse and run XSLT, and parse original or transformed XML and return specific nodes. """ + from lxml.etree import XML - self.xml_doc = self._parse_doc(self.path_or_buffer) + self.xml_doc = XML(self._parse_doc(self.path_or_buffer)) if self.stylesheet is not None: - self.xsl_doc = self._parse_doc(self.stylesheet) - self.xml_doc = self._transform_doc() + self.xsl_doc = XML(self._parse_doc(self.stylesheet)) + self.xml_doc = XML(self._transform_doc()) self._validate_path() self._validate_names() @@ -535,62 +535,47 @@ def _validate_names(self) -> None: f"{type(self.names).__name__} is not a valid type for names" ) - if lxml is not None: + def _parse_doc(self, raw_doc) -> bytes: from lxml.etree import ( - Element, - ElementTree, + XMLParser, + fromstring, + parse, + tostring, ) - def _parse_doc(self, raw_doc) -> Union[Element, ElementTree]: - """ - Build tree from path_or_buffer. + handle_data = get_data_from_filepath( + filepath_or_buffer=raw_doc, + encoding=self.encoding, + compression=self.compression, + storage_options=self.storage_options, + ) - This method will parse XML object into tree - either from string/bytes or file location. - """ + with preprocess_data(handle_data) as xml_data: + curr_parser = XMLParser(encoding=self.encoding) - from lxml.etree import ( - XMLParser, - fromstring, - parse, - ) + if isinstance(xml_data, io.StringIO): + doc = fromstring( + xml_data.getvalue().encode(self.encoding), parser=curr_parser + ) + else: + doc = parse(xml_data, parser=curr_parser) - handle_data = get_data_from_filepath( - filepath_or_buffer=raw_doc, - encoding=self.encoding, - compression=self.compression, - storage_options=self.storage_options, - ) + return tostring(doc) - with preprocess_data(handle_data) as xml_data: - curr_parser = XMLParser(encoding=self.encoding) + def _transform_doc(self) -> bytes: + """ + Transform original tree using stylesheet. - if isinstance(xml_data, io.StringIO): - doc = fromstring( - xml_data.getvalue().encode(self.encoding), parser=curr_parser - ) - else: - doc = parse(xml_data, parser=curr_parser) - - return doc - - def _transform_doc(self) -> Element: - """ - Transform original tree using stylesheet. - - This method will transform original xml using XSLT script into - am ideally flatter xml document for easier parsing and migration - to Data Frame. - """ - from lxml.etree import ( - XML, - XSLT, - ) + This method will transform original xml using XSLT script into + am ideally flatter xml document for easier parsing and migration + to Data Frame. + """ + from lxml.etree import XSLT - transformer = XSLT(self.xsl_doc) - new_doc = transformer(self.xml_doc) + transformer = XSLT(self.xsl_doc) + new_doc = transformer(self.xml_doc) - return XML(bytes(new_doc)) + return bytes(new_doc) def get_data_from_filepath( @@ -707,6 +692,8 @@ def _parse( * If parser is not lxml or etree. """ + lxml = import_optional_dependency("lxml.etree", errors="ignore") + p: Union[_EtreeFrameParser, _LxmlFrameParser] if parser == "lxml": From b197085537ffa7bd83bb49c4c60c3a3cac27dc8a Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Wed, 10 Mar 2021 07:42:20 -0600 Subject: [PATCH 3/5] Adjust base and etree classes for consistent parse_doc signature --- pandas/io/xml.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index e547588052548..abbfe7657624f 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -187,7 +187,7 @@ def _validate_names(self) -> None: """ raise AbstractMethodError(self) - def _parse_doc(self) -> bytes: + def _parse_doc(self, raw_doc) -> bytes: """ Build tree from path_or_buffer. @@ -214,7 +214,7 @@ def parse_data(self) -> List[Dict[str, Optional[str]]]: "To use stylesheet, you need lxml installed and selected as parser." ) - self.xml_doc = XML(self._parse_doc()) + self.xml_doc = XML(self._parse_doc(self.path_or_buffer)) self._validate_path() self._validate_names() @@ -351,7 +351,7 @@ def _validate_names(self) -> None: f"{type(self.names).__name__} is not a valid type for names" ) - def _parse_doc(self) -> bytes: + def _parse_doc(self, raw_doc) -> bytes: from xml.etree.ElementTree import ( XMLParser, parse, @@ -359,7 +359,7 @@ def _parse_doc(self) -> bytes: ) handle_data = get_data_from_filepath( - filepath_or_buffer=self.path_or_buffer, + filepath_or_buffer=raw_doc, encoding=self.encoding, compression=self.compression, storage_options=self.storage_options, From f35d879ba287f45b13a754d6be89cd45fc1c5f51 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Wed, 10 Mar 2021 19:06:53 -0600 Subject: [PATCH 4/5] Add test skip for etree before python 3.8 due to alpha order of attributes --- pandas/tests/io/xml/test_xml.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 6902b4e93443f..13e6a5344acd4 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -3,6 +3,7 @@ StringIO, ) import os +import sys from typing import Union from urllib.error import HTTPError @@ -253,6 +254,10 @@ def test_parser_consistency_file(datapath): @tm.network @pytest.mark.slow @td.skip_if_no("lxml") +@pytest.mark.skipif( + sys.version_info < (3, 8), + reason=("etree alpha ordered attributes <= py3.7"), +) def test_parser_consistency_url(datapath): url = ( "https://data.cityofchicago.org/api/views/" From d68f1396f793a43ad750b27c50e4f553fc9bb617 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Tue, 16 Mar 2021 11:26:15 -0500 Subject: [PATCH 5/5] Use PY38 variable in tests for python < 3.8 skips --- pandas/tests/io/xml/test_to_xml.py | 18 +++++++++--------- pandas/tests/io/xml/test_xml.py | 6 +++--- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py index 97793ce8f65b8..0b95750e94736 100644 --- a/pandas/tests/io/xml/test_to_xml.py +++ b/pandas/tests/io/xml/test_to_xml.py @@ -3,12 +3,12 @@ StringIO, ) import os -import sys from typing import Union import numpy as np import pytest +from pandas.compat import PY38 import pandas.util._test_decorators as td from pandas import DataFrame @@ -364,8 +364,8 @@ def test_na_empty_elem_option(datapath, parser): @pytest.mark.skipif( - sys.version_info < (3, 8), - reason=("etree alpha ordered attributes <= py3.7"), + not PY38, + reason=("etree alpha ordered attributes < py 3.8"), ) def test_attrs_cols_nan_output(datapath, parser): expected = """\ @@ -383,8 +383,8 @@ def test_attrs_cols_nan_output(datapath, parser): @pytest.mark.skipif( - sys.version_info < (3, 8), - reason=("etree alpha ordered attributes <= py3.7"), + not PY38, + reason=("etree alpha ordered attributes < py3.8"), ) def test_attrs_cols_prefix(datapath, parser): expected = """\ @@ -541,8 +541,8 @@ def test_hierarchical_columns(datapath, parser): @pytest.mark.skipif( - sys.version_info < (3, 8), - reason=("etree alpha ordered attributes <= py3.7"), + not PY38, + reason=("etree alpha ordered attributes < py3.8"), ) def test_hierarchical_attrs_columns(datapath, parser): expected = """\ @@ -614,8 +614,8 @@ def test_multi_index(datapath, parser): @pytest.mark.skipif( - sys.version_info < (3, 8), - reason=("etree alpha ordered attributes <= py3.7"), + not PY38, + reason=("etree alpha ordered attributes < py3.8"), ) def test_multi_index_attrs_cols(datapath, parser): expected = """\ diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 13e6a5344acd4..95751b6090a06 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -3,13 +3,13 @@ StringIO, ) import os -import sys from typing import Union from urllib.error import HTTPError import numpy as np import pytest +from pandas.compat import PY38 import pandas.util._test_decorators as td from pandas import DataFrame @@ -255,8 +255,8 @@ def test_parser_consistency_file(datapath): @pytest.mark.slow @td.skip_if_no("lxml") @pytest.mark.skipif( - sys.version_info < (3, 8), - reason=("etree alpha ordered attributes <= py3.7"), + not PY38, + reason=("etree alpha ordered attributes < py3.8"), ) def test_parser_consistency_url(datapath): url = (