From 0a5a5c20df1b1e56457323cfb47dca12f5df76f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=CC=81=20Duarte?= Date: Thu, 2 Dec 2021 16:57:45 +0000 Subject: [PATCH 1/5] BUG: allow ExtensionDtypes JSON (de)serialization --- doc/source/development/developer.rst | 2 +- doc/source/user_guide/io.rst | 17 + doc/source/whatsnew/v1.3.5.rst | 1 - doc/source/whatsnew/v1.4.0.rst | 1 + pandas/core/generic.py | 7 +- pandas/io/json/_json.py | 4 +- pandas/io/json/_table_schema.py | 14 +- pandas/tests/extension/decimal/array.py | 7 +- .../json/test_json_table_schema_ext_dtype.py | 337 ++++++++++++++++++ pandas/tests/io/json/test_pandas.py | 2 +- 10 files changed, 381 insertions(+), 11 deletions(-) create mode 100644 pandas/tests/io/json/test_json_table_schema_ext_dtype.py diff --git a/doc/source/development/developer.rst b/doc/source/development/developer.rst index d701208792a4c..6de237b70f08d 100644 --- a/doc/source/development/developer.rst +++ b/doc/source/development/developer.rst @@ -180,7 +180,7 @@ As an example of fully-formed metadata: 'numpy_type': 'int64', 'metadata': None} ], - 'pandas_version': '0.20.0', + 'pandas_version': '1.4.0', 'creator': { 'library': 'pyarrow', 'version': '0.13.0' diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index e21dac65c4c4c..3833041ff343b 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1891,6 +1891,7 @@ with optional parameters: ``index``; dict like {index -> {column -> value}} ``columns``; dict like {column -> {index -> value}} ``values``; just the values array + ``table``; adhering to the JSON `Table Schema`_ * ``date_format`` : string, type of date conversion, 'epoch' for timestamp, 'iso' for ISO8601. * ``double_precision`` : The number of decimal places to use when encoding floating point values, default 10. @@ -1907,6 +1908,18 @@ Note ``NaN``'s, ``NaT``'s and ``None`` will be converted to ``null`` and ``datet json = dfj.to_json() json +.. 
note:: + + When using ``orient='table'`` along with user-defined ``ExtensionArray``, + the generated schema will contain an additional ``extDtype`` key in the respective + ``fields`` element. This extra key is not standard but does enable JSON roundtrips + for extension types (e.g. ``read_json(df.to_json(orient="table"), orient="table")``). + + The ``extDtype`` key carries the name of the extension, if you have properly registered + the ``ExtensionDtype``, pandas will use said name to perform a lookup into the registry + and re-convert the serialized data into your custom dtype. + + Orient options ++++++++++++++ @@ -2465,6 +2478,10 @@ A few notes on the generated table schema: * For ``MultiIndex``, ``mi.names`` is used. If any level has no name, then ``level_`` is used. +* When using a ``DataFrame`` containing a ``Series`` backed by a used-defined + ``ExtensionArray``, the generated JSON will contain an extra ``extDtype`` + key under the respective ``fields`` array element. While this key is not standard + it enables roundtripping for custom types (e.g. ``read_json(df.to_json(orient="table"), orient="table")``). ``read_json`` also accepts ``orient='table'`` as an argument. This allows for the preservation of metadata such as dtypes and index names in a diff --git a/doc/source/whatsnew/v1.3.5.rst b/doc/source/whatsnew/v1.3.5.rst index 048cd978c4478..19f561c827ecb 100644 --- a/doc/source/whatsnew/v1.3.5.rst +++ b/doc/source/whatsnew/v1.3.5.rst @@ -29,7 +29,6 @@ Fixed regressions Bug fixes ~~~~~~~~~ - -- .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index fd7cb6a69d955..0da8936e9bc5c 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -218,6 +218,7 @@ Other enhancements ``USFederalHolidayCalendar``. See also `Other API changes`_. 
- :meth:`.Rolling.var`, :meth:`.Expanding.var`, :meth:`.Rolling.std`, :meth:`.Expanding.std` now support `Numba `_ execution with the ``engine`` keyword (:issue:`44461`) - :meth:`Series.info` has been added, for compatibility with :meth:`DataFrame.info` (:issue:`5167`) +- :class:`ExtensionDtype` and :class:`ExtensionArray` are now (de)serialized when exporting a :class:`DataFrame` with :meth:`DataFrame.to_json` using ``orient='table'`` (:issue:`20612`, :issue:`44705`). .. --------------------------------------------------------------------------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4aff7acc4c6fb..bade019a41e48 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2444,6 +2444,11 @@ def to_json( ``orient='table'`` contains a 'pandas_version' field under 'schema'. This stores the version of `pandas` used in the latest revision of the schema. + When using :class:`ExtensionDtype`-kind columns, the schema fields will + carry 'extDtype', this field stores the :class:`ExtensionDtype` name + and is used to resolve the correct dtype during deserialization. + This procedure is handled by the :class:`ExtensionDtype`'s + :func:`_from_sequence` method. 
Examples -------- @@ -2567,7 +2572,7 @@ def to_json( "primaryKey": [ "index" ], - "pandas_version": "0.20.0" + "pandas_version": "1.4.0" }}, "data": [ {{ diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 62f542de3437f..21d89f18d4959 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -68,8 +68,6 @@ loads = json.loads dumps = json.dumps -TABLE_SCHEMA_VERSION = "0.20.0" - # interface to/from def to_json( @@ -565,7 +563,7 @@ def read_json( {{"name":"col 1","type":"string"}},\ {{"name":"col 2","type":"string"}}],\ "primaryKey":["index"],\ -"pandas_version":"0.20.0"}},\ +"pandas_version":"1.4.0"}},\ "data":[\ {{"index":"row 1","col 1":"a","col 2":"b"}},\ {{"index":"row 2","col 1":"c","col 2":"d"}}]\ diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index 75fd950cd6076..deabc2f6f4ccd 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -18,11 +18,13 @@ JSONSerializable, ) +from pandas.core.dtypes.base import _registry as registry from pandas.core.dtypes.common import ( is_bool_dtype, is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, + is_extension_array_dtype, is_integer_dtype, is_numeric_dtype, is_period_dtype, @@ -40,6 +42,8 @@ loads = json.loads +TABLE_SCHEMA_VERSION = "1.4.0" + def as_json_table_type(x: DtypeObj) -> str: """ @@ -83,6 +87,8 @@ def as_json_table_type(x: DtypeObj) -> str: return "duration" elif is_categorical_dtype(x): return "any" + if is_extension_array_dtype(x): + return "any" elif is_string_dtype(x): return "string" else: @@ -134,6 +140,8 @@ def convert_pandas_type_to_json_field(arr): field["freq"] = dtype.freq.freqstr elif is_datetime64tz_dtype(dtype): field["tz"] = dtype.tz.zone + elif is_extension_array_dtype(dtype): + field["extDtype"] = dtype.name return field @@ -199,6 +207,8 @@ def convert_json_field_to_pandas_type(field): return CategoricalDtype( categories=field["constraints"]["enum"], ordered=field["ordered"] ) + if "extDtype" 
in field: + return registry.find(field["extDtype"]) else: return "object" @@ -257,7 +267,7 @@ def build_table_schema( {'name': 'B', 'type': 'string'}, \ {'name': 'C', 'type': 'datetime'}], \ 'primaryKey': ['idx'], \ -'pandas_version': '0.20.0'} +'pandas_version': '1.4.0'} """ if index is True: data = set_default_names(data) @@ -291,7 +301,7 @@ def build_table_schema( schema["primaryKey"] = primary_key if version: - schema["pandas_version"] = "0.20.0" + schema["pandas_version"] = TABLE_SCHEMA_VERSION return schema diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index fe7ebe4f4fb51..49bf75d5f46bd 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -67,8 +67,11 @@ class DecimalArray(OpsMixin, ExtensionScalarOpsMixin, ExtensionArray): def __init__(self, values, dtype=None, copy=False, context=None): for i, val in enumerate(values): - if is_float(val) and np.isnan(val): - values[i] = DecimalDtype.na_value + if is_float(val): + if np.isnan(val): + values[i] = DecimalDtype.na_value + else: + values[i] = DecimalDtype.type(val) elif not isinstance(val, decimal.Decimal): raise TypeError("All values must be of type " + str(decimal.Decimal)) values = np.asarray(values, dtype=object) diff --git a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py new file mode 100644 index 0000000000000..5a9651cf23cde --- /dev/null +++ b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py @@ -0,0 +1,337 @@ +"""Tests for ExtensionDtype Table Schema integration.""" + +from collections import OrderedDict +import datetime as dt +import decimal +import json +from typing import ( + Any, + Optional, + Sequence, + Tuple, + Union, + cast, +) + +import numpy as np +import pytest + +from pandas._typing import ( + Dtype, + PositionalIndexer, +) + +from pandas.core.dtypes.dtypes import register_extension_dtype + +from pandas import DataFrame 
+from pandas.api.extensions import ( + ExtensionArray, + ExtensionDtype, +) +from pandas.api.types import pandas_dtype +from pandas.core.series import Series +from pandas.tests.extension.decimal.array import ( + DecimalArray, + DecimalDtype, +) + +from pandas.io.json._table_schema import ( + as_json_table_type, + build_table_schema, +) + + +@register_extension_dtype +class DateDtype(ExtensionDtype): + @property + def type(self): + return dt.date + + @property + def name(self): + return "DateDtype" + + @classmethod + def construct_from_string(cls, string: str): + if not isinstance(string, str): + raise TypeError( + f"'construct_from_string' expects a string, got {type(string)}" + ) + + if string == cls.__name__: + return cls() + else: + raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") + + @classmethod + def construct_array_type(cls): + return DateArray + + @property + def na_value(self): + return dt.date.min + + def __repr__(self) -> str: + return self.name + + +class DateArray(ExtensionArray): + def __init__( + self, + dates: Union[ + dt.date, + Sequence[dt.date], + Tuple[np.ndarray, np.ndarray, np.ndarray], + np.ndarray, + ], + ) -> None: + if isinstance(dates, dt.date): + self._year = np.array([dates.year]) + self._month = np.array([dates.month]) + self._day = np.array([dates.year]) + return + + ldates = len(dates) + if isinstance(dates, list): + # pre-allocate the arrays since we know the size before hand + self._year = np.zeros(ldates, dtype=np.uint16) # 65535 (0, 9999) + self._month = np.zeros(ldates, dtype=np.uint8) # 255 (1, 31) + self._day = np.zeros(ldates, dtype=np.uint8) # 255 (1, 12) + # populate them + for i, (y, m, d) in enumerate( + map(lambda date: (date.year, date.month, date.day), dates) + ): + self._year[i] = y + self._month[i] = m + self._day[i] = d + + elif isinstance(dates, tuple): + # only support triples + if ldates != 3: + raise ValueError("only triples are valid") + # check if all elements have the same type + if 
any(map(lambda x: not isinstance(x, np.ndarray), dates)): + raise TypeError("invalid type") + ly, lm, ld = (len(cast(np.ndarray, d)) for d in dates) + if not ly == lm == ld: + raise ValueError( + f"tuple members must have the same length: {(ly, lm, ld)}" + ) + self._year = dates[0].astype(np.uint16) + self._month = dates[1].astype(np.uint8) + self._day = dates[2].astype(np.uint8) + + elif isinstance(dates, np.ndarray) and dates.dtype == "U10": + self._year = np.zeros(ldates, dtype=np.uint16) # 65535 (0, 9999) + self._month = np.zeros(ldates, dtype=np.uint8) # 255 (1, 31) + self._day = np.zeros(ldates, dtype=np.uint8) # 255 (1, 12) + + for (i,), (y, m, d) in np.ndenumerate(np.char.split(dates, sep="-")): + self._year[i] = int(y) + self._month[i] = int(m) + self._day[i] = int(d) + + else: + raise TypeError(f"{type(dates)} is not supported") + + @property + def dtype(self) -> ExtensionDtype: + return DateDtype() + + def astype(self, dtype, copy=True): + dtype = pandas_dtype(dtype) + + if isinstance(dtype, DateDtype): + data = self.copy() if copy else self + else: + data = self.to_numpy(dtype=dtype, copy=copy, na_value=dt.date.min) + + return data + + @property + def nbytes(self) -> int: + return self._year.nbytes + self._month.nbytes + self._day.nbytes + + def __len__(self) -> int: + return len(self._year) # all 3 arrays are enforced to have the same length + + def __getitem__(self, item: PositionalIndexer): + if isinstance(item, int): + return dt.date(self._year[item], self._month[item], self._day[item]) + else: + raise NotImplementedError("only ints are supported as indexes") + + def __setitem__(self, key: Union[int, slice, np.ndarray], value: Any): + if not isinstance(key, int): + raise NotImplementedError("only ints are supported as indexes") + + if not isinstance(value, dt.date): + raise TypeError("you can only set datetime.date types") + + self._year[key] = value.year + self._month[key] = value.month + self._day[key] = value.day + + def __repr__(self) -> str: + 
return f"DateArray{list(zip(self._year, self._month, self._day))}" + + def copy(self) -> "DateArray": + return DateArray((self._year.copy(), self._month.copy(), self._day.copy())) + + def isna(self) -> np.ndarray: + return np.logical_and( + np.logical_and( + self._year == dt.date.min.year, self._month == dt.date.min.month + ), + self._day == dt.date.min.day, + ) + + @classmethod + def _from_sequence(cls, scalars, *, dtype: Optional[Dtype] = None, copy=False): + if isinstance(scalars, dt.date): + pass + elif isinstance(scalars, DateArray): + pass + elif isinstance(scalars, np.ndarray): + scalars = scalars.astype("U10") # 10 chars for yyyy-mm-dd + return DateArray(scalars) + + +class TestBuildSchema: + def setup_method(self, method): + self.da = DateArray([dt.date(2021, 10, 10)]) + self.dc = DecimalArray([decimal.Decimal(10)]) + self.df = DataFrame( + { + "A": self.da, + "B": self.dc, + } + ) + + def test_build_table_schema(self): + result = build_table_schema(self.df, version=False) + expected = { + "fields": [ + {"name": "index", "type": "integer"}, + {"name": "A", "type": "any", "extDtype": "DateDtype"}, + {"name": "B", "type": "any", "extDtype": "decimal"}, + ], + "primaryKey": ["index"], + } + assert result == expected + result = build_table_schema(self.df) + assert "pandas_version" in result + + +class TestTableSchemaType: + @pytest.mark.parametrize( + "date_data", + [ + DateArray([dt.date(2021, 10, 10)]), + DateArray(dt.date(2021, 10, 10)), + Series(DateArray(dt.date(2021, 10, 10))), + ], + ) + def test_as_json_table_type_ext_date_array_dtype(self, date_data): + assert as_json_table_type(date_data.dtype) == "any" + + def test_as_json_table_type_ext_date_dtype(self): + assert as_json_table_type(DateDtype()) == "any" + + @pytest.mark.parametrize( + "decimal_data", + [ + DecimalArray([decimal.Decimal(10)]), + Series(DecimalArray([decimal.Decimal(10)])), + ], + ) + def test_as_json_table_type_ext_decimal_array_dtype(self, decimal_data): + assert 
as_json_table_type(decimal_data.dtype) == "any" + + def test_as_json_table_type_ext_decimal_dtype(self): + assert as_json_table_type(DecimalDtype()) == "any" + + +class TestTableOrient: + def setup_method(self, method): + self.da = DateArray([dt.date(2021, 10, 10)]) + self.dc = DecimalArray([decimal.Decimal(10)]) + self.df = DataFrame( + { + "A": self.da, + "B": self.dc, + } + ) + + def test_build_date_series(self): + s = Series(self.da, name="a") + s.index.name = "id" + result = s.to_json(orient="table", date_format="iso") + result = json.loads(result, object_pairs_hook=OrderedDict) + + assert "pandas_version" in result["schema"] + result["schema"].pop("pandas_version") + + fields = [ + {"name": "id", "type": "integer"}, + {"name": "a", "type": "any", "extDtype": "DateDtype"}, + ] + + schema = {"fields": fields, "primaryKey": ["id"]} + + expected = OrderedDict( + [ + ("schema", schema), + ("data", [OrderedDict([("id", 0), ("a", "2021-10-10T00:00:00.000Z")])]), + ] + ) + + assert result == expected + + def test_build_decimal_series(self): + s = Series(self.dc, name="a") + s.index.name = "id" + result = s.to_json(orient="table", date_format="iso") + result = json.loads(result, object_pairs_hook=OrderedDict) + + assert "pandas_version" in result["schema"] + result["schema"].pop("pandas_version") + + fields = [ + {"name": "id", "type": "integer"}, + {"name": "a", "type": "any", "extDtype": "decimal"}, + ] + + schema = {"fields": fields, "primaryKey": ["id"]} + + expected = OrderedDict( + [ + ("schema", schema), + ("data", [OrderedDict([("id", 0), ("a", 10.0)])]), + ] + ) + + assert result == expected + + def test_to_json(self): + df = self.df.copy() + df.index.name = "idx" + result = df.to_json(orient="table", date_format="iso") + result = json.loads(result, object_pairs_hook=OrderedDict) + + assert "pandas_version" in result["schema"] + result["schema"].pop("pandas_version") + + fields = [ + OrderedDict({"name": "idx", "type": "integer"}), + OrderedDict({"name": "A", 
"type": "any", "extDtype": "DateDtype"}), + OrderedDict({"name": "B", "type": "any", "extDtype": "decimal"}), + ] + + schema = OrderedDict({"fields": fields, "primaryKey": ["idx"]}) + data = [ + OrderedDict([("idx", 0), ("A", "2021-10-10T00:00:00.000Z"), ("B", 10.0)]) + ] + expected = OrderedDict([("schema", schema), ("data", data)]) + + assert result == expected diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 747770ad78684..517e3e18c1a37 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1675,7 +1675,7 @@ def test_to_json_indent(self, indent): "primaryKey":[ "index" ], - "pandas_version":"0.20.0" + "pandas_version":"1.4.0" }, "data":[ { From 29cd0119474865695cfaa69948f5d9eab6058bbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=CC=81=20Duarte?= Date: Wed, 8 Dec 2021 21:29:48 +0000 Subject: [PATCH 2/5] apply suggestions --- doc/source/user_guide/io.rst | 29 ++++++++++++----------------- pandas/core/generic.py | 5 ----- 2 files changed, 12 insertions(+), 22 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 750810dcec84a..fed63b333b4ed 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1920,18 +1920,6 @@ Note ``NaN``'s, ``NaT``'s and ``None`` will be converted to ``null`` and ``datet json = dfj.to_json() json -.. note:: - - When using ``orient='table'`` along with user-defined ``ExtensionArray``, - the generated schema will contain an additional ``extDtype`` key in the respective - ``fields`` element. This extra key is not standard but does enable JSON roundtrips - for extension types (e.g. ``read_json(df.to_json(orient="table"), orient="table")``). - - The ``extDtype`` key carries the name of the extension, if you have properly registered - the ``ExtensionDtype``, pandas will use said name to perform a lookup into the registry - and re-convert the serialized data into your custom dtype. 
- - Orient options ++++++++++++++ @@ -2490,11 +2478,6 @@ A few notes on the generated table schema: * For ``MultiIndex``, ``mi.names`` is used. If any level has no name, then ``level_`` is used. -* When using a ``DataFrame`` containing a ``Series`` backed by a used-defined - ``ExtensionArray``, the generated JSON will contain an extra ``extDtype`` - key under the respective ``fields`` array element. While this key is not standard - it enables roundtripping for custom types (e.g. ``read_json(df.to_json(orient="table"), orient="table")``). - ``read_json`` also accepts ``orient='table'`` as an argument. This allows for the preservation of metadata such as dtypes and index names in a round-trippable manner. @@ -2538,6 +2521,18 @@ indicate missing values and the subsequent read cannot distinguish the intent. .. _Table Schema: https://specs.frictionlessdata.io/table-schema/ +.. note:: + + When using ``orient='table'`` along with user-defined ``ExtensionArray``, + the generated schema will contain an additional ``extDtype`` key in the respective + ``fields`` element. This extra key is not standard but does enable JSON roundtrips + for extension types (e.g. ``read_json(df.to_json(orient="table"), orient="table")``). + + The ``extDtype`` key carries the name of the extension, if you have properly registered + the ``ExtensionDtype``, pandas will use said name to perform a lookup into the registry + and re-convert the serialized data into your custom dtype. + + HTML ---- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0489e7ff30f09..350984da3a78a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2444,11 +2444,6 @@ def to_json( ``orient='table'`` contains a 'pandas_version' field under 'schema'. This stores the version of `pandas` used in the latest revision of the schema. 
- When using :class:`ExtensionDtype`-kind columns, the schema fields will - carry 'extDtype', this field stores the :class:`ExtensionDtype` name - and is used to resolve the correct dtype during deserialization. - This procedure is handled by the :class:`ExtensionDtype`'s - :func:`_from_sequence` method. Examples -------- From 9d87d98fd4f62e293f0c764184c174713883d823 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=CC=81=20Duarte?= Date: Thu, 9 Dec 2021 19:16:25 +0000 Subject: [PATCH 3/5] add tests over Int64 and String arrays --- .../json/test_json_table_schema_ext_dtype.py | 105 +++++++++++++++++- 1 file changed, 103 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py index 5a9651cf23cde..cce86f48d5784 100644 --- a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py +++ b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py @@ -23,12 +23,17 @@ from pandas.core.dtypes.dtypes import register_extension_dtype -from pandas import DataFrame +from pandas import ( + DataFrame, + array, +) from pandas.api.extensions import ( ExtensionArray, ExtensionDtype, ) from pandas.api.types import pandas_dtype +from pandas.core.arrays.integer import Int64Dtype +from pandas.core.arrays.string_ import StringDtype from pandas.core.series import Series from pandas.tests.extension.decimal.array import ( DecimalArray, @@ -201,10 +206,14 @@ class TestBuildSchema: def setup_method(self, method): self.da = DateArray([dt.date(2021, 10, 10)]) self.dc = DecimalArray([decimal.Decimal(10)]) + self.sa = array(["pandas"], dtype="string") + self.ia = array([10], dtype="Int64") self.df = DataFrame( { "A": self.da, "B": self.dc, + "C": self.sa, + "D": self.ia, } ) @@ -215,6 +224,8 @@ def test_build_table_schema(self): {"name": "index", "type": "integer"}, {"name": "A", "type": "any", "extDtype": "DateDtype"}, {"name": "B", "type": "any", "extDtype": "decimal"}, + {"name": "C", "type": 
"any", "extDtype": "string"}, + {"name": "D", "type": "integer", "extDtype": "Int64"}, ], "primaryKey": ["index"], } @@ -251,15 +262,45 @@ def test_as_json_table_type_ext_decimal_array_dtype(self, decimal_data): def test_as_json_table_type_ext_decimal_dtype(self): assert as_json_table_type(DecimalDtype()) == "any" + @pytest.mark.parametrize( + "string_data", + [ + array(["pandas"], dtype="string"), + Series(array(["pandas"], dtype="string")), + ], + ) + def test_as_json_table_type_ext_string_array_dtype(self, string_data): + assert as_json_table_type(string_data.dtype) == "any" + + def test_as_json_table_type_ext_string_dtype(self): + assert as_json_table_type(StringDtype()) == "any" + + @pytest.mark.parametrize( + "integer_data", + [ + array([10], dtype="Int64"), + Series(array([10], dtype="Int64")), + ], + ) + def test_as_json_table_type_ext_integer_array_dtype(self, integer_data): + assert as_json_table_type(integer_data.dtype) == "integer" + + def test_as_json_table_type_ext_integer_dtype(self): + assert as_json_table_type(Int64Dtype()) == "integer" + class TestTableOrient: def setup_method(self, method): self.da = DateArray([dt.date(2021, 10, 10)]) self.dc = DecimalArray([decimal.Decimal(10)]) + self.sa = array(["pandas"], dtype="string") + self.ia = array([10], dtype="Int64") self.df = DataFrame( { "A": self.da, "B": self.dc, + "C": self.sa, + "D": self.ia, } ) @@ -313,6 +354,56 @@ def test_build_decimal_series(self): assert result == expected + def test_build_string_series(self): + s = Series(self.sa, name="a") + s.index.name = "id" + result = s.to_json(orient="table", date_format="iso") + result = json.loads(result, object_pairs_hook=OrderedDict) + + assert "pandas_version" in result["schema"] + result["schema"].pop("pandas_version") + + fields = [ + {"name": "id", "type": "integer"}, + {"name": "a", "type": "any", "extDtype": "string"}, + ] + + schema = {"fields": fields, "primaryKey": ["id"]} + + expected = OrderedDict( + [ + ("schema", schema), + 
("data", [OrderedDict([("id", 0), ("a", "pandas")])]), + ] + ) + + assert result == expected + + def test_build_int64_series(self): + s = Series(self.ia, name="a") + s.index.name = "id" + result = s.to_json(orient="table", date_format="iso") + result = json.loads(result, object_pairs_hook=OrderedDict) + + assert "pandas_version" in result["schema"] + result["schema"].pop("pandas_version") + + fields = [ + {"name": "id", "type": "integer"}, + {"name": "a", "type": "integer", "extDtype": "Int64"}, + ] + + schema = {"fields": fields, "primaryKey": ["id"]} + + expected = OrderedDict( + [ + ("schema", schema), + ("data", [OrderedDict([("id", 0), ("a", 10)])]), + ] + ) + + assert result == expected + def test_to_json(self): df = self.df.copy() df.index.name = "idx" @@ -326,11 +417,21 @@ def test_to_json(self): OrderedDict({"name": "idx", "type": "integer"}), OrderedDict({"name": "A", "type": "any", "extDtype": "DateDtype"}), OrderedDict({"name": "B", "type": "any", "extDtype": "decimal"}), + OrderedDict({"name": "C", "type": "any", "extDtype": "string"}), + OrderedDict({"name": "D", "type": "integer", "extDtype": "Int64"}), ] schema = OrderedDict({"fields": fields, "primaryKey": ["idx"]}) data = [ - OrderedDict([("idx", 0), ("A", "2021-10-10T00:00:00.000Z"), ("B", 10.0)]) + OrderedDict( + [ + ("idx", 0), + ("A", "2021-10-10T00:00:00.000Z"), + ("B", 10.0), + ("C", "pandas"), + ("D", 10), + ] + ) ] expected = OrderedDict([("schema", schema), ("data", data)]) From 6ad85773a8491be6471b68cd5d4e0d3a76dc7e05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=CC=81=20Duarte?= Date: Sun, 19 Dec 2021 15:33:20 +0000 Subject: [PATCH 4/5] apply suggestions --- doc/source/user_guide/io.rst | 18 +- pandas/io/json/_table_schema.py | 4 +- pandas/tests/extension/date/__init__.py | 6 + pandas/tests/extension/date/array.py | 180 +++++++++++++++++ .../json/test_json_table_schema_ext_dtype.py | 181 +----------------- pandas/tests/io/json/test_pandas.py | 2 +- 6 files changed, 201 insertions(+), 
190 deletions(-) create mode 100644 pandas/tests/extension/date/__init__.py create mode 100644 pandas/tests/extension/date/array.py diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index fed63b333b4ed..403599297a492 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2519,18 +2519,16 @@ indicate missing values and the subsequent read cannot distinguish the intent. os.remove("test.json") -.. _Table Schema: https://specs.frictionlessdata.io/table-schema/ - -.. note:: +When using ``orient='table'`` along with user-defined ``ExtensionArray``, +the generated schema will contain an additional ``extDtype`` key in the respective +``fields`` element. This extra key is not standard but does enable JSON roundtrips +for extension types (e.g. ``read_json(df.to_json(orient="table"), orient="table")``). - When using ``orient='table'`` along with user-defined ``ExtensionArray``, - the generated schema will contain an additional ``extDtype`` key in the respective - ``fields`` element. This extra key is not standard but does enable JSON roundtrips - for extension types (e.g. ``read_json(df.to_json(orient="table"), orient="table")``). +The ``extDtype`` key carries the name of the extension; if you have properly registered +the ``ExtensionDtype``, pandas will use said name to perform a lookup into the registry +and re-convert the serialized data into your custom dtype. - The ``extDtype`` key carries the name of the extension, if you have properly registered - the ``ExtensionDtype``, pandas will use said name to perform a lookup into the registry - and re-convert the serialized data into your custom dtype. +..
_Table Schema: https://specs.frictionlessdata.io/table-schema/ HTML diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index d1f75afcbb063..cb2d426f6b81b 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -87,7 +87,7 @@ def as_json_table_type(x: DtypeObj) -> str: return "duration" elif is_categorical_dtype(x): return "any" - if is_extension_array_dtype(x): + elif is_extension_array_dtype(x): return "any" elif is_string_dtype(x): return "string" @@ -203,7 +203,7 @@ def convert_json_field_to_pandas_type(field): return CategoricalDtype( categories=field["constraints"]["enum"], ordered=field["ordered"] ) - if "extDtype" in field: + elif "extDtype" in field: return registry.find(field["extDtype"]) else: return "object" diff --git a/pandas/tests/extension/date/__init__.py b/pandas/tests/extension/date/__init__.py new file mode 100644 index 0000000000000..2a8c7e9f57a5d --- /dev/null +++ b/pandas/tests/extension/date/__init__.py @@ -0,0 +1,6 @@ +from pandas.tests.extension.date.array import ( + DateArray, + DateDtype, +) + +__all__ = ["DateArray", "DateDtype"] diff --git a/pandas/tests/extension/date/array.py b/pandas/tests/extension/date/array.py new file mode 100644 index 0000000000000..d29ed293e71ed --- /dev/null +++ b/pandas/tests/extension/date/array.py @@ -0,0 +1,180 @@ +import datetime as dt +from typing import ( + Any, + Optional, + Sequence, + Tuple, + Union, + cast, +) + +import numpy as np + +from pandas._typing import ( + Dtype, + PositionalIndexer, +) + +from pandas.core.dtypes.dtypes import register_extension_dtype + +from pandas.api.extensions import ( + ExtensionArray, + ExtensionDtype, +) +from pandas.api.types import pandas_dtype + + +@register_extension_dtype +class DateDtype(ExtensionDtype): + @property + def type(self): + return dt.date + + @property + def name(self): + return "DateDtype" + + @classmethod + def construct_from_string(cls, string: str): + if not isinstance(string, str): + 
raise TypeError( + f"'construct_from_string' expects a string, got {type(string)}" + ) + + if string == cls.__name__: + return cls() + else: + raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") + + @classmethod + def construct_array_type(cls): + return DateArray + + @property + def na_value(self): + return dt.date.min + + def __repr__(self) -> str: + return self.name + + +class DateArray(ExtensionArray): + def __init__( + self, + dates: Union[ + dt.date, + Sequence[dt.date], + Tuple[np.ndarray, np.ndarray, np.ndarray], + np.ndarray, + ], + ) -> None: + if isinstance(dates, dt.date): + self._year = np.array([dates.year]) + self._month = np.array([dates.month]) + self._day = np.array([dates.day]) + return + + ldates = len(dates) + if isinstance(dates, list): + # pre-allocate the arrays since we know the size beforehand + self._year = np.zeros(ldates, dtype=np.uint16) # 65535 (1, 9999) + self._month = np.zeros(ldates, dtype=np.uint8) # 255 (1, 12) + self._day = np.zeros(ldates, dtype=np.uint8) # 255 (1, 31) + # populate them + for i, (y, m, d) in enumerate( + map(lambda date: (date.year, date.month, date.day), dates) + ): + self._year[i] = y + self._month[i] = m + self._day[i] = d + + elif isinstance(dates, tuple): + # only support triples + if ldates != 3: + raise ValueError("only triples are valid") + # check if all elements have the same type + if any(map(lambda x: not isinstance(x, np.ndarray), dates)): + raise TypeError("invalid type") + ly, lm, ld = (len(cast(np.ndarray, d)) for d in dates) + if not ly == lm == ld: + raise ValueError( + f"tuple members must have the same length: {(ly, lm, ld)}" + ) + self._year = dates[0].astype(np.uint16) + self._month = dates[1].astype(np.uint8) + self._day = dates[2].astype(np.uint8) + + elif isinstance(dates, np.ndarray) and dates.dtype == "U10": + self._year = np.zeros(ldates, dtype=np.uint16) # 65535 (1, 9999) + self._month = np.zeros(ldates, dtype=np.uint8) # 255 (1, 12) + self._day = 
np.zeros(ldates, dtype=np.uint8) # 255 (1, 31) + + for (i,), (y, m, d) in np.ndenumerate(np.char.split(dates, sep="-")): + self._year[i] = int(y) + self._month[i] = int(m) + self._day[i] = int(d) + + else: + raise TypeError(f"{type(dates)} is not supported") + + @property + def dtype(self) -> ExtensionDtype: + return DateDtype() + + def astype(self, dtype, copy=True): + dtype = pandas_dtype(dtype) + + if isinstance(dtype, DateDtype): + data = self.copy() if copy else self + else: + data = self.to_numpy(dtype=dtype, copy=copy, na_value=dt.date.min) + + return data + + @property + def nbytes(self) -> int: + return self._year.nbytes + self._month.nbytes + self._day.nbytes + + def __len__(self) -> int: + return len(self._year) # all 3 arrays are enforced to have the same length + + def __getitem__(self, item: PositionalIndexer): + if isinstance(item, int): + return dt.date(self._year[item], self._month[item], self._day[item]) + else: + raise NotImplementedError("only ints are supported as indexes") + + def __setitem__(self, key: Union[int, slice, np.ndarray], value: Any): + if not isinstance(key, int): + raise NotImplementedError("only ints are supported as indexes") + + if not isinstance(value, dt.date): + raise TypeError("you can only set datetime.date types") + + self._year[key] = value.year + self._month[key] = value.month + self._day[key] = value.day + + def __repr__(self) -> str: + return f"DateArray{list(zip(self._year, self._month, self._day))}" + + def copy(self) -> "DateArray": + return DateArray((self._year.copy(), self._month.copy(), self._day.copy())) + + def isna(self) -> np.ndarray: + return np.logical_and( + np.logical_and( + self._year == dt.date.min.year, self._month == dt.date.min.month + ), + self._day == dt.date.min.day, + ) + + @classmethod + def _from_sequence(cls, scalars, *, dtype: Optional[Dtype] = None, copy=False): + if isinstance(scalars, dt.date): + pass + elif isinstance(scalars, DateArray): + pass + elif isinstance(scalars, np.ndarray): 
+ scalars = scalars.astype("U10") # 10 chars for yyyy-mm-dd + return DateArray(scalars) diff --git a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py index cce86f48d5784..3daac204aa730 100644 --- a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py +++ b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py @@ -4,37 +4,20 @@ import datetime as dt import decimal import json -from typing import ( - Any, - Optional, - Sequence, - Tuple, - Union, - cast, -) -import numpy as np import pytest -from pandas._typing import ( - Dtype, - PositionalIndexer, -) - -from pandas.core.dtypes.dtypes import register_extension_dtype - from pandas import ( DataFrame, array, ) -from pandas.api.extensions import ( - ExtensionArray, - ExtensionDtype, -) -from pandas.api.types import pandas_dtype from pandas.core.arrays.integer import Int64Dtype from pandas.core.arrays.string_ import StringDtype from pandas.core.series import Series +from pandas.tests.extension.date import ( + DateArray, + DateDtype, +) from pandas.tests.extension.decimal.array import ( DecimalArray, DecimalDtype, @@ -46,162 +29,6 @@ ) -@register_extension_dtype -class DateDtype(ExtensionDtype): - @property - def type(self): - return dt.date - - @property - def name(self): - return "DateDtype" - - @classmethod - def construct_from_string(cls, string: str): - if not isinstance(string, str): - raise TypeError( - f"'construct_from_string' expects a string, got {type(string)}" - ) - - if string == cls.__name__: - return cls() - else: - raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") - - @classmethod - def construct_array_type(cls): - return DateArray - - @property - def na_value(self): - return dt.date.min - - def __repr__(self) -> str: - return self.name - - -class DateArray(ExtensionArray): - def __init__( - self, - dates: Union[ - dt.date, - Sequence[dt.date], - Tuple[np.ndarray, np.ndarray, np.ndarray], - np.ndarray, - ], - 
) -> None: - if isinstance(dates, dt.date): - self._year = np.array([dates.year]) - self._month = np.array([dates.month]) - self._day = np.array([dates.year]) - return - - ldates = len(dates) - if isinstance(dates, list): - # pre-allocate the arrays since we know the size before hand - self._year = np.zeros(ldates, dtype=np.uint16) # 65535 (0, 9999) - self._month = np.zeros(ldates, dtype=np.uint8) # 255 (1, 31) - self._day = np.zeros(ldates, dtype=np.uint8) # 255 (1, 12) - # populate them - for i, (y, m, d) in enumerate( - map(lambda date: (date.year, date.month, date.day), dates) - ): - self._year[i] = y - self._month[i] = m - self._day[i] = d - - elif isinstance(dates, tuple): - # only support triples - if ldates != 3: - raise ValueError("only triples are valid") - # check if all elements have the same type - if any(map(lambda x: not isinstance(x, np.ndarray), dates)): - raise TypeError("invalid type") - ly, lm, ld = (len(cast(np.ndarray, d)) for d in dates) - if not ly == lm == ld: - raise ValueError( - f"tuple members must have the same length: {(ly, lm, ld)}" - ) - self._year = dates[0].astype(np.uint16) - self._month = dates[1].astype(np.uint8) - self._day = dates[2].astype(np.uint8) - - elif isinstance(dates, np.ndarray) and dates.dtype == "U10": - self._year = np.zeros(ldates, dtype=np.uint16) # 65535 (0, 9999) - self._month = np.zeros(ldates, dtype=np.uint8) # 255 (1, 31) - self._day = np.zeros(ldates, dtype=np.uint8) # 255 (1, 12) - - for (i,), (y, m, d) in np.ndenumerate(np.char.split(dates, sep="-")): - self._year[i] = int(y) - self._month[i] = int(m) - self._day[i] = int(d) - - else: - raise TypeError(f"{type(dates)} is not supported") - - @property - def dtype(self) -> ExtensionDtype: - return DateDtype() - - def astype(self, dtype, copy=True): - dtype = pandas_dtype(dtype) - - if isinstance(dtype, DateDtype): - data = self.copy() if copy else self - else: - data = self.to_numpy(dtype=dtype, copy=copy, na_value=dt.date.min) - - return data - - 
@property - def nbytes(self) -> int: - return self._year.nbytes + self._month.nbytes + self._day.nbytes - - def __len__(self) -> int: - return len(self._year) # all 3 arrays are enforced to have the same length - - def __getitem__(self, item: PositionalIndexer): - if isinstance(item, int): - return dt.date(self._year[item], self._month[item], self._day[item]) - else: - raise NotImplementedError("only ints are supported as indexes") - - def __setitem__(self, key: Union[int, slice, np.ndarray], value: Any): - if not isinstance(key, int): - raise NotImplementedError("only ints are supported as indexes") - - if not isinstance(value, dt.date): - raise TypeError("you can only set datetime.date types") - - self._year[key] = value.year - self._month[key] = value.month - self._day[key] = value.day - - def __repr__(self) -> str: - return f"DateArray{list(zip(self._year, self._month, self._day))}" - - def copy(self) -> "DateArray": - return DateArray((self._year.copy(), self._month.copy(), self._day.copy())) - - def isna(self) -> np.ndarray: - return np.logical_and( - np.logical_and( - self._year == dt.date.min.year, self._month == dt.date.min.month - ), - self._day == dt.date.min.day, - ) - - @classmethod - def _from_sequence(cls, scalars, *, dtype: Optional[Dtype] = None, copy=False): - if isinstance(scalars, dt.date): - pass - elif isinstance(scalars, DateArray): - pass - elif isinstance(scalars, np.ndarray): - scalars = scalars.astype("U10") # 10 chars for yyyy-mm-dd - return DateArray(scalars) - - class TestBuildSchema: def setup_method(self, method): self.da = DateArray([dt.date(2021, 10, 10)]) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 1cfda5c096fba..97927270d0be6 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1673,7 +1673,7 @@ def test_to_json_indent(self, indent): "primaryKey":[ "index" ], - "pandas_version":"1.4.0" + "pandas_version":"0.2.0" }, "data":[ { From 
a342ca7edebaa866878e514bbb8aaf3523e47200 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=CC=81=20Duarte?= Date: Sun, 19 Dec 2021 18:36:18 +0000 Subject: [PATCH 5/5] fix version --- pandas/tests/io/json/test_pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 97927270d0be6..1cfda5c096fba 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1673,7 +1673,7 @@ def test_to_json_indent(self, indent): "primaryKey":[ "index" ], - "pandas_version":"0.2.0" + "pandas_version":"1.4.0" }, "data":[ {