googleapis
diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py
Lines changed: 3 additions & 0 deletions
diff --git a/db_dtypes/json.py b/db_dtypes/json.py
Lines changed: 273 additions & 0 deletions
@@ -28,6 +28,7 @@
 import pyarrow.compute
 
 from db_dtypes import core
+from db_dtypes.json import JSONArray, JSONDtype
 from db_dtypes.version import __version__
 
 date_dtype_name = "dbdate"
@@ -341,6 +342,8 @@ def __sub__(self, other):
     "__version__",
     "DateArray",
     "DateDtype",
+    "JSONDtype",
+    "JSONArray",
     "TimeArray",
     "TimeDtype",
 ]
@@ -0,0 +1,273 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from collections import UserDict, abc
+import itertools
+import numbers
+import string
+import sys
+from typing import TYPE_CHECKING, Any
+
+import numpy as np
+import pandas as pd
+from pandas.api.extensions import ExtensionArray, ExtensionDtype
+from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
+from pandas.core.dtypes.common import is_bool_dtype, is_list_like, pandas_dtype
+from pandas.core.indexers import unpack_tuple_and_ellipses
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+
+    from pandas._typing import type_t
+
+
+@pd.api.extensions.register_extension_dtype
+class JSONDtype(pd.api.extensions.ExtensionDtype):
+    """Extension dtype for JSON data.
+
+    The dtype is registered with pandas through ``register_extension_dtype``
+    and carries the string name ``"dbjson"``. Its scalars are mappings, and
+    an empty ``UserDict`` is used as the missing-value marker.
+    """
+
+    # type = str
+
+    # Scalar type of the elements. JSONArray.__init__ rejects any value that
+    # is not an instance of this type.
+    type = abc.Mapping
+    # String name of the dtype.
+    name = "dbjson"
+    # na_value = pd.NA  # TODO: StringDtype is libmissing.NA
+
+    # Missing-value marker. JSONArray.isna compares each element against it
+    # and JSONArray.take uses it as the default fill value.
+    na_value: Mapping[str, Any] = UserDict()
+    # _is_numeric = False
+    # _is_boolean = False
+
+    @classmethod
+    def construct_array_type(cls):
+        """Return the array type associated with this dtype."""
+        return JSONArray
+
+    # NOTE(review): the block below is a commented-out draft of
+    # ``__from_arrow__``. It refers to names (``pyarrow``, ``numpy``,
+    # ``pandas``, ``ensure_string_array``, ``Union``) that this module does
+    # not import, so it cannot be enabled as written.
+    # @staticmethod
+    # def __from_arrow__(
+    #     array: Union[pyarrow.Array, pyarrow.ChunkedArray]
+    # ) -> "JSONArray":
+    #     """Convert to JSONArray from an Arrow array.
+
+    #     See:
+    #     https://pandas.pydata.org/pandas-docs/stable/development/extending.html#compatibility-with-apache-arrow
+    #     """
+    #     if isinstance(array, pyarrow.Array):
+    #         chunks = [array]
+    #     else:
+    #         chunks = array.chunks
+
+    #     results = []
+    #     for arr in chunks:
+    #         # convert chunk by chunk to numpy and concatenate then, to avoid
+    #         # overflow for large string data when concatenating the pyarrow arrays
+    #         arr = arr.to_numpy(zero_copy_only=False)
+    #         arr = ensure_string_array(arr, na_value=pandas.NA)
+    #         results.append(arr)
+
+    #     if len(chunks) == 0:
+    #         arr = numpy.array([], dtype=str)
+    #     else:
+    #         arr = numpy.concatenate(results)
+
+    #     return JSONArray(arr)
+
+    #     # TODO: codes from StringDtype
+    #     # # Bypass validation inside StringArray constructor, see GH#47781
+    #     # new_string_array = StringArray.__new__(StringArray)
+    #     # NDArrayBacked.__init__(
+    #     #     new_string_array,
+    #     #     arr,
+    #     #     StringDtype(storage="python"),
+    #     # )
+    #     # return new_string_array
+
+
+class JSONArray(pd.api.extensions.ExtensionArray):
+    """Extension array containing JSON data.
+
+    Elements live in a plain Python list (``self.data``). Every element must
+    be an instance of ``JSONDtype.type``; an element that compares equal to
+    ``JSONDtype.na_value`` is reported as missing by ``isna``.
+    """
+
+    dtype = JSONDtype()
+    __array_priority__ = 1000
+
+    def __init__(self, values, dtype=None, copy=False) -> None:
+        """Wrap ``values`` after checking that every element is a mapping.
+
+        Args:
+            values: Iterable of mappings. A ``list`` is stored as-is (and is
+                therefore shared with the caller); any other iterable is
+                materialized into a new list.
+            dtype: Accepted for signature compatibility; not used.
+            copy: Accepted for signature compatibility; not used.
+
+        Raises:
+            TypeError: If an element is not an instance of ``dtype.type``.
+        """
+        if not isinstance(values, list):
+            # A one-shot iterator would be exhausted by the validation loop
+            # below, and tuples/ndarrays do not behave like a list under
+            # ``__setitem__`` and ``copy``; normalize to a list up front.
+            values = list(values)
+        for val in values:
+            self._check_element(val)
+        self.data = values
+
+        # Some aliases for common attribute names to ensure pandas supports
+        # these
+        self._items = self._data = self.data
+        # those aliases are currently not working due to assumptions
+        # in internal code (GH-20735)
+        # self._values = self.values = self.data
+
+    def _check_element(self, val) -> None:
+        """Raise ``TypeError`` unless ``val`` is an instance of ``dtype.type``."""
+        if not isinstance(val, self.dtype.type):
+            raise TypeError(
+                f"All values must be of type {str(self.dtype.type)}: "
+                f"actual {type(val)}"
+            )
+
+    @classmethod
+    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
+        """Build an array from ``scalars``; ``dtype`` and ``copy`` are ignored."""
+        return cls(scalars)
+
+    @classmethod
+    def _from_factorized(cls, values, original):
+        """Rebuild an array from factorized tuples of items.
+
+        Empty tuples are skipped; every other entry becomes a ``UserDict``.
+        """
+        return cls([UserDict(x) for x in values if x != ()])
+
+    def __getitem__(self, item):
+        """Return one element (integer index) or a new array (anything else).
+
+        Raises:
+            IndexError: If ``item`` is a non-list-like value that is neither
+                an integer nor a slice.
+        """
+        if isinstance(item, tuple):
+            item = unpack_tuple_and_ellipses(item)
+
+        if isinstance(item, numbers.Integral):
+            return self.data[item]
+        elif isinstance(item, slice) and item == slice(None):
+            # Make sure we get a view
+            return type(self)(self.data)
+        elif isinstance(item, slice):
+            # slice
+            return type(self)(self.data[item])
+        elif not is_list_like(item):
+            # e.g. "foo" or 2.5
+            # exception message copied from numpy
+            raise IndexError(
+                r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
+                r"(`None`) and integer or boolean arrays are valid indices"
+            )
+        else:
+            item = pd.api.indexers.check_array_indexer(self, item)
+            if is_bool_dtype(item.dtype):
+                return type(self)._from_sequence(
+                    [x for x, m in zip(self, item) if m], dtype=self.dtype
+                )
+            # integer
+            return type(self)([self.data[i] for i in item])
+
+    def __setitem__(self, key, value) -> None:
+        """Assign ``value`` at ``key``.
+
+        ``key`` may be an integer, a slice, a boolean NumPy mask, or an
+        iterable of integer positions. A ``value`` that is neither a
+        ``JSONArray`` nor a sequence is broadcast to every selected position.
+
+        Raises:
+            TypeError: If a value to be stored is not an instance of
+                ``dtype.type``.
+        """
+        if isinstance(key, numbers.Integral):
+            # Validate here too so a scalar assignment cannot break the
+            # "every element is a mapping" invariant enforced by __init__.
+            self._check_element(value)
+            self.data[key] = value
+            return
+
+        if isinstance(key, slice):
+            # A slice is not iterable; expand it to the positions it selects.
+            key = range(*key.indices(len(self)))
+
+        if not isinstance(value, (type(self), abc.Sequence)):
+            # broadcast value
+            value = itertools.cycle([value])
+
+        if isinstance(key, np.ndarray) and key.dtype == "bool":
+            # masking
+            for i, (k, v) in enumerate(zip(key, value)):
+                if k:
+                    # Explicit check instead of ``assert``: assertions are
+                    # removed when Python runs with -O.
+                    self._check_element(v)
+                    self.data[i] = v
+        else:
+            for k, v in zip(key, value):
+                self._check_element(v)
+                self.data[k] = v
+
+    def __len__(self) -> int:
+        """Return the number of elements."""
+        return len(self.data)
+
+    def __eq__(self, other):
+        # Element-wise comparison is not implemented for this array.
+        return NotImplemented
+
+    def __ne__(self, other):
+        # Element-wise comparison is not implemented for this array.
+        return NotImplemented
+
+    def __array__(self, dtype=None, copy=None):
+        """Convert to a NumPy array (object dtype by default).
+
+        ``copy`` is accepted but not used.
+        """
+        if dtype is None:
+            dtype = object
+        if dtype == object:
+            # on py38 builds it looks like numpy is inferring to a non-1D array
+            return construct_1d_object_array_from_listlike(list(self))
+        return np.asarray(self.data, dtype=dtype)
+
+    @property
+    def nbytes(self) -> int:
+        """Size of the backing list as reported by ``sys.getsizeof``.
+
+        This is a shallow measure: the elements themselves are not counted.
+        """
+        return sys.getsizeof(self.data)
+
+    def isna(self):
+        """Return a boolean array marking elements equal to ``dtype.na_value``."""
+        return np.array([x == self.dtype.na_value for x in self.data], dtype=bool)
+
+    def take(self, indexer, allow_fill=False, fill_value=None):
+        """Return a new array with the elements at the positions in ``indexer``.
+
+        Args:
+            indexer: Integer positions to take.
+            allow_fill: When true, ``-1`` marks a position to be filled with
+                ``fill_value`` and any index below ``-1`` is rejected.
+            fill_value: Value for ``-1`` positions; defaults to
+                ``dtype.na_value``.
+
+        Raises:
+            ValueError: If ``allow_fill`` is true and an index is below ``-1``.
+            IndexError: If an index is out of bounds.
+        """
+        # re-implement here, since NumPy has trouble setting
+        # sized objects like UserDicts into scalar slots of
+        # an ndarray.
+        indexer = np.asarray(indexer)
+        msg = (
+            "Index is out of bounds or cannot do a "
+            "non-empty take from an empty array."
+        )
+
+        if allow_fill:
+            # Do not allow any custom na_value
+            if fill_value is None:
+                fill_value = self.dtype.na_value
+            # bounds check
+            if (indexer < -1).any():
+                raise ValueError(
+                    "Invalid value in 'indexer': with allow_fill=True, "
+                    "indices must be >= -1."
+                )
+            try:
+                output = [
+                    self.data[loc] if loc != -1 else fill_value for loc in indexer
+                ]
+            except IndexError as err:
+                raise IndexError(msg) from err
+        else:
+            try:
+                output = [self.data[loc] for loc in indexer]
+            except IndexError as err:
+                raise IndexError(msg) from err
+
+        return type(self)._from_sequence(output, dtype=self.dtype)
+
+    def copy(self):
+        """Return a new array backed by a shallow copy of the data."""
+        return type(self)(self.data[:])
+
+    def astype(self, dtype, copy=True):
+        """Cast to ``dtype``.
+
+        Returns this array (or a copy, when ``copy`` is true) if ``dtype`` is
+        this array's own dtype, a string array for a pandas ``StringDtype``,
+        and otherwise a NumPy array built from plain ``dict`` copies of the
+        elements.
+        """
+        # NumPy has issues when all the dicts are the same length.
+        # np.array([UserDict(...), UserDict(...)]) fails,
+        # but np.array([{...}, {...}]) works, so cast.
+        from pandas.core.arrays.string_ import StringDtype
+
+        dtype = pandas_dtype(dtype)
+        # needed to add this check for the Series constructor
+        if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
+            if copy:
+                return self.copy()
+            return self
+        elif isinstance(dtype, StringDtype):
+            value = self.astype(str)  # numpy doesn't like nested dicts
+            arr_cls = dtype.construct_array_type()
+            return arr_cls._from_sequence(value, dtype=dtype, copy=False)
+        elif not copy:
+            return np.asarray([dict(x) for x in self], dtype=dtype)
+        else:
+            return np.array([dict(x) for x in self], dtype=dtype, copy=copy)
+
+    def unique(self):
+        """Return the distinct elements in order of first appearance.
+
+        Elements are compared through the tuple of their items, so their
+        values must be hashable. The result holds plain ``dict`` objects.
+        """
+        # Parent method doesn't work since np.array will try to infer
+        # a 2-dim object.
+        # ``dict.fromkeys`` de-duplicates while keeping insertion order; a
+        # ``set`` would return the uniques in arbitrary order.
+        seen = dict.fromkeys(tuple(d.items()) for d in self.data)
+        return type(self)([dict(items) for items in seen])
+
+    @classmethod
+    def _concat_same_type(cls, to_concat):
+        """Concatenate the backing data of several arrays into one array."""
+        data = list(itertools.chain.from_iterable(x.data for x in to_concat))
+        return cls(data)
+
+    def _values_for_factorize(self):
+        """Return hashable stand-ins for factorization and the NA marker ``()``."""
+        frozen = self._values_for_argsort()
+        if len(frozen) == 0:
+            # factorize_array expects 1-d array, this is a len-0 2-d array.
+            frozen = frozen.ravel()
+        return frozen, ()
+
+    def _values_for_argsort(self):
+        """Return a 1-d object array holding each element's tuple of items."""
+        # Bypass NumPy's shape inference to get a (N,) array of tuples.
+        frozen = [tuple(x.items()) for x in self]
+        return construct_1d_object_array_from_listlike(frozen)
+
+    def _pad_or_backfill(self, *, method, limit=None, copy=True):
+        """Delegate to the parent implementation without further arguments."""
+        # GH#56616 - test EA method without limit_area argument
+        return super()._pad_or_backfill(method=method, limit=limit, copy=copy)