diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index b1387e9717079..712635d7a7e2a 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -39,6 +39,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following
 * :func:`read_fwf`
 * :func:`read_excel`
 * :func:`read_html`
+* :func:`read_xml`
 * :func:`read_sql`
 * :func:`read_sql_query`
 * :func:`read_sql_table`
@@ -49,6 +50,7 @@ to select the nullable dtypes implementation.
 * :func:`read_csv` (with ``engine="pyarrow"`` or ``engine="python"``)
 * :func:`read_excel`
 * :func:`read_html`
+* :func:`read_xml`
 * :func:`read_parquet`
 * :func:`read_orc`

diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx
index 478e7eaee90c1..9154e836b3477 100644
--- a/pandas/_libs/ops.pyx
+++ b/pandas/_libs/ops.pyx
@@ -292,7 +292,7 @@ def maybe_convert_bool(ndarray[object] arr,
             result[i] = 1
         elif val in false_vals:
             result[i] = 0
-        elif is_nan(val):
+        elif is_nan(val) or val is None:
             mask[i] = 1
             result[i] = 0  # Value here doesn't matter, will be replaced w/ nan
             has_na = True
diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index 4f61455826286..1368a407fa494 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -774,6 +774,7 @@ def _parse(
     iterparse: dict[str, list[str]] | None,
     compression: CompressionOptions,
     storage_options: StorageOptions,
+    use_nullable_dtypes: bool = False,
     **kwargs,
 ) -> DataFrame:
     """
@@ -843,6 +844,7 @@ def _parse(
         dtype=dtype,
         converters=converters,
         parse_dates=parse_dates,
+        use_nullable_dtypes=use_nullable_dtypes,
         **kwargs,
     )

@@ -869,6 +871,7 @@ def read_xml(
     iterparse: dict[str, list[str]] | None = None,
     compression: CompressionOptions = "infer",
     storage_options: StorageOptions = None,
+    use_nullable_dtypes: bool = False,
 ) -> DataFrame:
     r"""
     Read XML document into a ``DataFrame`` object.
@@ -980,6 +983,19 @@ def read_xml(
 
     {storage_options}
 
+    use_nullable_dtypes : bool = False
+        Whether or not to use nullable dtypes as default when reading data. If
+        set to True, nullable dtypes are used for all dtypes that have a nullable
+        implementation, even if no nulls are present.
+
+        The nullable dtype implementation can be configured by calling
+        ``pd.set_option("mode.dtype_backend", "pandas")`` to use
+        numpy-backed nullable dtypes or
+        ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use
+        pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).
+
+        .. versionadded:: 2.0
+
     Returns
     -------
     df
@@ -1113,4 +1129,5 @@ def read_xml(
         iterparse=iterparse,
         compression=compression,
         storage_options=storage_options,
+        use_nullable_dtypes=use_nullable_dtypes,
     )
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index aeaf2d3b7edbf..d65b9b8af4365 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -21,8 +21,17 @@
 )
 import pandas.util._test_decorators as td
 
-from pandas import DataFrame
+import pandas as pd
+from pandas import (
+    NA,
+    DataFrame,
+    Series,
+)
 import pandas._testing as tm
+from pandas.core.arrays import (
+    ArrowStringArray,
+    StringArray,
+)
 
 from pandas.io.common import get_handle
 from pandas.io.xml import read_xml
@@ -1702,3 +1711,74 @@ def test_s3_parser_consistency():
     )
 
     tm.assert_frame_equal(df_lxml, df_etree)
+
+
+@pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"])
+def test_read_xml_nullable_dtypes(parser, string_storage, dtype_backend):
+    # GH#50500
+    if string_storage == "pyarrow" or dtype_backend == "pyarrow":
+        pa = pytest.importorskip("pyarrow")
+    data = """<?xml version='1.0' encoding='utf-8'?>
+<data>
+<row>
+  <a>x</a>
+  <b>1</b>
+  <c>4.0</c>
+  <d>x</d>
+  <e>2</e>
+  <f>4.0</f>
+  <g></g>
+  <h>True</h>
+  <i>False</i>
+</row>
+<row>
+  <a>y</a>
+  <b>2</b>
+  <c>5.0</c>
+  <d/>
+  <e/>
+  <f/>
+  <g/>
+  <h>False</h>
+  <i/>
+</row>
+</data>"""
+
+    if string_storage == "python":
+        string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
+        string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))
+
+    else:
+        string_array = ArrowStringArray(pa.array(["x", "y"]))
+        string_array_na = ArrowStringArray(pa.array(["x", None]))
+
+    with pd.option_context("mode.string_storage", string_storage):
+        with pd.option_context("mode.dtype_backend", dtype_backend):
+            result = read_xml(data, parser=parser, use_nullable_dtypes=True)
+
+    expected = DataFrame(
+        {
+            "a": string_array,
+            "b": Series([1, 2], dtype="Int64"),
+            "c": Series([4.0, 5.0], dtype="Float64"),
+            "d": string_array_na,
+            "e": Series([2, NA], dtype="Int64"),
+            "f": Series([4.0, NA], dtype="Float64"),
+            "g": Series([NA, NA], dtype="Int64"),
+            "h": Series([True, False], dtype="boolean"),
+            "i": Series([False, NA], dtype="boolean"),
+        }
+    )
+
+    if dtype_backend == "pyarrow":
+        from pandas.arrays import ArrowExtensionArray
+
+        expected = DataFrame(
+            {
+                col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
+                for col in expected.columns
+            }
+        )
+        expected["g"] = ArrowExtensionArray(pa.array([None, None]))
+
+    tm.assert_frame_equal(result, expected)
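
For reference, a minimal sketch of how the new keyword is intended to be used once this patch is applied. The inline XML document and column names below are illustrative only (not taken from the pandas test suite); the option names and the keyword itself come from the patch above.

    import pandas as pd

    xml = """<?xml version='1.0' encoding='utf-8'?>
    <data>
      <row><name>x</name><value>1</value></row>
      <row><name/><value>2</value></row>
    </data>"""

    # "pandas" selects numpy-backed nullable dtypes; "pyarrow" selects
    # pyarrow-backed dtypes via pd.ArrowDtype.
    pd.set_option("mode.dtype_backend", "pandas")

    # With use_nullable_dtypes=True, missing entries become pd.NA and
    # columns get nullable dtypes (e.g. Int64, string) instead of
    # object/float64, even where no nulls are present.
    df = pd.read_xml(xml, use_nullable_dtypes=True)
    print(df.dtypes)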