Skip to content

Commit 6cb5c78

Browse files
committed
Copy JSONDtype and JSONArray from tests/extension/json and their tests
1 parent f2bdeea commit 6cb5c78

File tree

5 files changed

+974
-0
lines changed

5 files changed

+974
-0
lines changed

db_dtypes/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import pyarrow.compute
2929

3030
from db_dtypes import core
31+
from db_dtypes.json import JSONArray, JSONDtype
3132
from db_dtypes.version import __version__
3233

3334
date_dtype_name = "dbdate"
@@ -341,6 +342,8 @@ def __sub__(self, other):
341342
"__version__",
342343
"DateArray",
343344
"DateDtype",
345+
"JSONDtype",
346+
"JSONArray",
344347
"TimeArray",
345348
"TimeDtype",
346349
]

db_dtypes/json.py

Lines changed: 273 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,273 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from __future__ import annotations
16+
17+
from collections import UserDict, abc
18+
import itertools
19+
import numbers
20+
import string
21+
import sys
22+
from typing import TYPE_CHECKING, Any
23+
24+
import numpy as np
25+
import pandas as pd
26+
from pandas.api.extensions import ExtensionArray, ExtensionDtype
27+
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
28+
from pandas.core.dtypes.common import is_bool_dtype, is_list_like, pandas_dtype
29+
from pandas.core.indexers import unpack_tuple_and_ellipses
30+
31+
if TYPE_CHECKING:
32+
from collections.abc import Mapping
33+
34+
from pandas._typing import type_t
35+
36+
37+
@pd.api.extensions.register_extension_dtype
class JSONDtype(pd.api.extensions.ExtensionDtype):
    """Extension dtype for JSON data.

    Scalars are mapping objects (``dict``, ``UserDict``, ...); the
    missing-value sentinel is an empty ``UserDict``.
    """

    # Scalar type accepted/held by JSONArray: any collections.abc.Mapping.
    type = abc.Mapping
    # Registered pandas dtype name, usable as Series(..., dtype="dbjson").
    name = "dbjson"

    # Missing-value sentinel. NOTE(review): because this is an *empty*
    # mapping, a stored value of {} compares equal to na_value and is
    # reported as missing by JSONArray.isna — confirm this is intended.
    na_value: Mapping[str, Any] = UserDict()

    @classmethod
    def construct_array_type(cls):
        """Return the array type associated with this dtype."""
        return JSONArray

    # TODO: implement __from_arrow__ for pyarrow interoperability. See:
    # https://pandas.pydata.org/pandas-docs/stable/development/extending.html#compatibility-with-apache-arrow
96+
class JSONArray(pd.api.extensions.ExtensionArray):
    """Extension array containing JSON data.

    Values are stored as a plain Python sequence of mapping objects
    (``dict`` / ``UserDict``).  Missing values are represented by the
    dtype's ``na_value`` (an empty ``UserDict``), so ``isna`` treats any
    empty mapping as missing.
    """

    # Singleton dtype shared by every instance of this array type.
    dtype = JSONDtype()
    # High priority so NumPy defers to this class in mixed operations.
    __array_priority__ = 1000

    def __init__(self, values, dtype=None, copy=False) -> None:
        """Wrap *values*, a sequence of mapping scalars.

        Raises TypeError if any element is not a ``Mapping``.
        Note: the ``dtype`` and ``copy`` parameters are accepted for
        ExtensionArray-constructor compatibility but are ignored; the
        input sequence is stored by reference, not copied.
        """
        for val in values:
            if not isinstance(val, self.dtype.type):
                raise TypeError(f"All values must be of type {str(self.dtype.type)}: actual {type(val)}")
        self.data = values

        # Some aliases for common attribute names to ensure pandas supports
        # these
        self._items = self._data = self.data
        # those aliases are currently not working due to assumptions
        # in internal code (GH-20735)
        # self._values = self.values = self.data

    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
        """Construct from a sequence of scalars (pandas EA interface)."""
        return cls(scalars)

    @classmethod
    def _from_factorized(cls, values, original):
        """Rebuild an array from factorized values.

        ``values`` holds the frozen tuples produced by
        ``_values_for_factorize``; ``()`` is the NA sentinel there, so
        such entries are dropped rather than converted.
        """
        return cls([UserDict(x) for x in values if x != ()])

    def __getitem__(self, item):
        """Scalar, slice, or array indexing per the ExtensionArray contract."""
        if isinstance(item, tuple):
            item = unpack_tuple_and_ellipses(item)

        if isinstance(item, numbers.Integral):
            # Scalar indexing returns the stored mapping itself.
            return self.data[item]
        elif isinstance(item, slice) and item == slice(None):
            # Make sure we get a view
            return type(self)(self.data)
        elif isinstance(item, slice):
            # slice
            return type(self)(self.data[item])
        elif not is_list_like(item):
            # e.g. "foo" or 2.5
            # exception message copied from numpy
            raise IndexError(
                r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
                r"(`None`) and integer or boolean arrays are valid indices"
            )
        else:
            # Normalize list-like indexers (validates bounds / dtype).
            item = pd.api.indexers.check_array_indexer(self, item)
            if is_bool_dtype(item.dtype):
                # Boolean mask: keep elements where the mask is True.
                return type(self)._from_sequence(
                    [x for x, m in zip(self, item) if m], dtype=self.dtype
                )
            # integer
            return type(self)([self.data[i] for i in item])

    def __setitem__(self, key, value) -> None:
        """Set one or many positions; scalars are broadcast over array keys."""
        if isinstance(key, numbers.Integral):
            self.data[key] = value
        else:
            if not isinstance(value, (type(self), abc.Sequence)):
                # broadcast value
                value = itertools.cycle([value])

            if isinstance(key, np.ndarray) and key.dtype == "bool":
                # masking
                for i, (k, v) in enumerate(zip(key, value)):
                    if k:
                        assert isinstance(v, self.dtype.type)
                        self.data[i] = v
            else:
                # Integer (or other positional) keys.
                for k, v in zip(key, value):
                    assert isinstance(v, self.dtype.type)
                    self.data[k] = v

    def __len__(self) -> int:
        return len(self.data)

    def __eq__(self, other):
        # Element-wise comparison is not supported for JSON values.
        return NotImplemented

    def __ne__(self, other):
        return NotImplemented

    def __array__(self, dtype=None, copy=None):
        """Convert to a NumPy array; object dtype by default."""
        if dtype is None:
            dtype = object
        if dtype == object:
            # on py38 builds it looks like numpy is inferring to a non-1D array
            return construct_1d_object_array_from_listlike(list(self))
        return np.asarray(self.data, dtype=dtype)

    @property
    def nbytes(self) -> int:
        # Shallow size of the backing sequence only — does not include
        # the sizes of the contained mappings.
        return sys.getsizeof(self.data)

    def isna(self):
        """Boolean mask of missing entries.

        Uses equality with ``na_value`` (an empty UserDict), so any
        empty mapping counts as missing.
        """
        return np.array([x == self.dtype.na_value for x in self.data], dtype=bool)

    def take(self, indexer, allow_fill=False, fill_value=None):
        """Take elements by position (pandas EA `take` semantics)."""
        # re-implement here, since NumPy has trouble setting
        # sized objects like UserDicts into scalar slots of
        # an ndarray.
        indexer = np.asarray(indexer)
        msg = (
            "Index is out of bounds or cannot do a "
            "non-empty take from an empty array."
        )

        if allow_fill:
            # Do not allow any custom na_value
            if fill_value is None:
                fill_value = self.dtype.na_value
            # bounds check
            if (indexer < -1).any():
                raise ValueError
            try:
                # -1 marks a missing slot to be filled with fill_value.
                output = [
                    self.data[loc] if loc != -1 else fill_value for loc in indexer
                ]
            except IndexError as err:
                raise IndexError(msg) from err
        else:
            try:
                # Here negative indices index from the end, NumPy-style.
                output = [self.data[loc] for loc in indexer]
            except IndexError as err:
                raise IndexError(msg) from err

        return type(self)._from_sequence(output, dtype=self.dtype)

    def copy(self):
        # Shallow copy: new backing list, same mapping objects.
        return type(self)(self.data[:])

    def astype(self, dtype, copy=True):
        """Cast to another dtype (self, string, or a NumPy dtype)."""
        # NumPy has issues when all the dicts are the same length.
        # np.array([UserDict(...), UserDict(...)]) fails,
        # but np.array([{...}, {...}]) works, so cast.
        from pandas.core.arrays.string_ import StringDtype

        dtype = pandas_dtype(dtype)
        # needed to add this check for the Series constructor
        if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
            if copy:
                return self.copy()
            return self
        elif isinstance(dtype, StringDtype):
            value = self.astype(str)  # numpy doesn't like nested dicts
            arr_cls = dtype.construct_array_type()
            return arr_cls._from_sequence(value, dtype=dtype, copy=False)
        elif not copy:
            return np.asarray([dict(x) for x in self], dtype=dtype)
        else:
            return np.array([dict(x) for x in self], dtype=dtype, copy=copy)

    def unique(self):
        """Return distinct values; order is unspecified (set-based)."""
        # Parent method doesn't work since np.array will try to infer
        # a 2-dim object.
        return type(self)([dict(x) for x in {tuple(d.items()) for d in self.data}])

    @classmethod
    def _concat_same_type(cls, to_concat):
        """Concatenate several JSONArrays into one."""
        data = list(itertools.chain.from_iterable(x.data for x in to_concat))
        return cls(data)

    def _values_for_factorize(self):
        """Return (values, na_sentinel) for factorization; NA is ``()``."""
        frozen = self._values_for_argsort()
        if len(frozen) == 0:
            # factorize_array expects 1-d array, this is a len-0 2-d array.
            frozen = frozen.ravel()
        return frozen, ()

    def _values_for_argsort(self):
        # Bypass NumPy's shape inference to get a (N,) array of tuples.
        frozen = [tuple(x.items()) for x in self]
        return construct_1d_object_array_from_listlike(frozen)

    def _pad_or_backfill(self, *, method, limit=None, copy=True):
        # GH#56616 - test EA method without limit_area argument
        return super()._pad_or_backfill(method=method, limit=limit, copy=copy)

0 commit comments

Comments
 (0)