Commit 5df28eb

supports all types except Array
1 parent: 508cdea

2 files changed (+93, -99 lines)

db_dtypes/json.py

Lines changed: 61 additions & 14 deletions

@@ -23,7 +23,16 @@
 import pandas.core.dtypes.common as common
 import pandas.core.indexers as indexers
 import pyarrow as pa
+import pyarrow.compute
 
+ARROW_CMP_FUNCS = {
+    "eq": pyarrow.compute.equal,
+    "ne": pyarrow.compute.not_equal,
+    "lt": pyarrow.compute.less,
+    "gt": pyarrow.compute.greater,
+    "le": pyarrow.compute.less_equal,
+    "ge": pyarrow.compute.greater_equal,
+}
 
 @pd.api.extensions.register_extension_dtype
 class JSONDtype(pd.api.extensions.ExtensionDtype):
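
The table keys match Python's rich-comparison dunder names (operator.eq.__name__ is "eq"), and the values are plain Arrow string kernels, so ordering comparisons on JSON data end up lexicographic over the serialized text. A minimal sketch, independent of this library:

    import pyarrow as pa
    import pyarrow.compute as pc

    arr = pa.array(['{"a": 1}', '{"a": 2}'])
    print(pc.equal(arr, '{"a": 1}'))  # [true, false]
    print(pc.less(arr, '{"a": 2}'))   # lexicographic on the strings: [true, false]
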
@@ -130,7 +139,7 @@ def _from_sequence(cls, scalars, *, dtype=None, copy=False):
         result = []
         for scalar in scalars:
             result.append(JSONArray._serialize_json(scalar))
-        return cls(pa.array(result, type=pa.large_string(), from_pandas=True))
+        return cls(pa.array(result, type=pa.string(), from_pandas=True))
 
     @classmethod
     def _from_sequence_of_strings(
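
The storage type changes from pa.large_string() (64-bit offsets) to pa.string() (32-bit offsets), which lines up with pandas' default pyarrow string storage. A quick sketch of the construction path, with from_pandas=True mapping None/NaN to Arrow nulls:

    import pyarrow as pa

    arr = pa.array(['{"a": 1}', None], type=pa.string(), from_pandas=True)
    print(arr.type)        # string
    print(arr.null_count)  # 1
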
@@ -143,7 +152,7 @@ def _from_sequence_of_strings(
     def _concat_same_type(cls, to_concat) -> JSONArray:
         """Concatenate multiple JSONArray."""
         chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()]
-        arr = pa.chunked_array(chunks, type=pa.large_string())
+        arr = pa.chunked_array(chunks, type=pa.string())
         return cls(arr)
 
     @classmethod
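
Concatenation flattens every source array's chunks into one chunked array of the same (now 32-bit) string type. Roughly:

    import pyarrow as pa

    a = pa.chunked_array([pa.array(['{"a": 1}'])])
    b = pa.chunked_array([pa.array(['{"b": 2}'])])
    chunks = [chunk for ca in (a, b) for chunk in ca.iterchunks()]
    merged = pa.chunked_array(chunks, type=pa.string())
    print(len(merged), merged.num_chunks)  # 2 2
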
@@ -154,7 +163,7 @@ def _from_factorized(cls, values, original):
     @staticmethod
     def _serialize_json(value):
         """A static method that converts a JSON value into a string representation."""
-        if isinstance(value, str) or pd.isna(value):
+        if pd.isna(value):
             return value
         else:
             # `sort_keys=True` sorts dictionary keys before serialization, making
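
Dropping the isinstance(value, str) branch means plain strings are now JSON-encoded like any other value instead of passed through verbatim, so the Python string '{"a": 1}' and the dict {"a": 1} no longer serialize to the same stored text. A standalone sketch of the new behavior, assuming the method continues with json.dumps(value, sort_keys=True) as the truncated comment suggests:

    import json
    import pandas as pd

    def serialize_json(value):
        if pd.isna(value):
            return value
        # sort_keys=True gives a canonical key order before serialization
        return json.dumps(value, sort_keys=True)

    print(serialize_json({"b": 1, "a": 2}))  # {"a": 2, "b": 1}
    print(serialize_json("hello"))           # "hello" (quoted, no passthrough)
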
@@ -174,17 +183,10 @@ def dtype(self) -> JSONDtype:
         """An instance of JSONDtype"""
         return self._dtype
 
-    def __contains__(self, key) -> bool:
-        """Return for `item in self`."""
-        return super().__contains__(JSONArray._serialize_json(key))
-
-    def insert(self, loc: int, item) -> JSONArray:
-        """
-        Make new ExtensionArray inserting new item at location. Follows Python
-        list.append semantics for negative values.
-        """
-        val = JSONArray._serialize_json(item)
-        return super().insert(loc, val)
+    def _cmp_method(self, other, op):
+        pc_func = ARROW_CMP_FUNCS[op.__name__]
+        result = pc_func(self._pa_array, self._box_pa(other))
+        return arrays.ArrowExtensionArray(result)
 
     def __getitem__(self, item):
         """Select a subset of self."""
@@ -244,3 +246,48 @@ def __iter__(self):
                 yield self._dtype.na_value
             else:
                 yield val
+
+    def _reduce(
+        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
+    ):
+        """Return a scalar result of performing the reduction operation."""
+        if name in ["min", "max"]:
+            raise TypeError("JSONArray does not support min/max reduction.")
+        return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
+
+    def __array__(self, dtype=None, copy=None) -> np.ndarray:
+        """Correctly construct numpy arrays when passed to `np.asarray()`."""
+        return self.to_numpy(dtype=dtype)
+
+    def to_numpy(self, dtype=None, copy=False, na_value=pd.NA) -> np.ndarray:
+        dtype, na_value = self._to_numpy_dtype_inference(dtype, na_value, self._hasna)
+        pa_type = self._pa_array.type
+        if not self._hasna or pd.isna(na_value) or pa.types.is_null(pa_type):
+            data = self
+        else:
+            data = self.fillna(na_value)
+        result = np.array(list(data), dtype=dtype)
+
+        if data._hasna:
+            result[data.isna()] = na_value
+        return result
+
+    def _to_numpy_dtype_inference(self, dtype, na_value, hasna):
+        if dtype is not None:
+            dtype = np.dtype(dtype)
+
+        if dtype is None or not hasna:
+            na_value = self.dtype.na_value
+        elif dtype.kind == "f":  # type: ignore[union-attr]
+            na_value = np.nan
+        elif dtype.kind == "M":  # type: ignore[union-attr]
+            na_value = np.datetime64("nat")
+        elif dtype.kind == "m":  # type: ignore[union-attr]
+            na_value = np.timedelta64("nat")
+        else:
+            na_value = self.dtype.na_value
+        return dtype, na_value
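
The inference helper resembles pandas' internal dtype/NA inference: when the caller requests a concrete dtype and the array has missing values, the NA sentinel is coerced to something representable (NaN for floats, NaT for datetimes and timedeltas); otherwise the dtype's own na_value is used. The same rules as standalone logic, for illustration only:

    import numpy as np
    import pandas as pd

    def infer_na(dtype, hasna, default_na=pd.NA):
        if dtype is not None:
            dtype = np.dtype(dtype)
        if dtype is None or not hasna:
            return dtype, default_na
        if dtype.kind == "f":
            return dtype, np.nan
        if dtype.kind == "M":
            return dtype, np.datetime64("nat")
        if dtype.kind == "m":
            return dtype, np.timedelta64("nat")
        return dtype, default_na

    print(infer_na("float64", hasna=True))  # (dtype('float64'), nan)
    print(infer_na(None, hasna=True))       # (None, <NA>)
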

tests/compliance/json/test_json_compliance.py

Lines changed: 32 additions & 85 deletions

@@ -21,6 +21,7 @@
 from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
 from pandas.tests.extension import base
 import pytest
+import db_dtypes
 
 
 class TestJSONArray(base.ExtensionTests):
@@ -111,7 +112,7 @@ def test_compare_scalar(self, data, comparison_op, request):
         super().test_compare_scalar(data, comparison_op)
 
     def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
-        return op_name in ["min", "max"]
+        return False
 
     def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
         dtype = typing.cast(pd.StringDtype, tm.get_dtype(obj))
@@ -125,43 +126,6 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
     def test_searchsorted(self, data_for_sorting, as_series):
         super().test_searchsorted(self, data_for_sorting, as_series)
 
-    def test_astype_str(self, data):
-        # Use `json.dumps(str)` instead of passing `str(obj)` directly to the super method.
-        result = pd.Series(data[:5]).astype(str)
-        expected = pd.Series(
-            [json.dumps(x, sort_keys=True) for x in data[:5]], dtype=str
-        )
-        tm.assert_series_equal(result, expected)
-
-    @pytest.mark.parametrize(
-        "nullable_string_dtype",
-        [
-            "string[python]",
-            "string[pyarrow]",
-        ],
-    )
-    def test_astype_string(self, data, nullable_string_dtype):
-        # Use `json.dumps(str)` instead of passing `str(obj)` directly to the super method.
-        result = pd.Series(data[:5]).astype(nullable_string_dtype)
-        expected = pd.Series(
-            [json.dumps(x, sort_keys=True) for x in data[:5]],
-            dtype=nullable_string_dtype,
-        )
-        tm.assert_series_equal(result, expected)
-
-    def test_array_interface(self, data):
-        result = np.array(data)
-        # Use `json.dumps(data[0])` instead of passing `data[0]` directly to the super method.
-        assert result[0] == json.dumps(data[0])
-
-        result = np.array(data, dtype=object)
-        # Use `json.dumps(x)` instead of passing `x` directly to the super method.
-        expected = np.array([json.dumps(x) for x in data], dtype=object)
-        if expected.ndim > 1:
-            # nested data, explicitly construct as 1D
-            expected = construct_1d_object_array_from_listlike(list(data))
-        tm.assert_numpy_array_equal(result, expected)
-
     @pytest.mark.xfail(reason="Setting a dict as a scalar")
     def test_fillna_series(self):
         """We treat dictionaries as a mapping in fillna, not a scalar."""
@@ -212,7 +176,7 @@ def test_series_constructor_scalar_with_index(self, data, dtype):
         expected = pd.Series([scalar], index=["foo"], dtype=dtype)
         tm.assert_series_equal(result, expected)
 
-    # Patching `json.dumps` to base.BaseSetitemTests because pandas' internals has
+    # Patching `[....] * len()` to base.BaseSetitemTests because pandas' internals
     # has trouble setting sequences of values into scalar positions.
 
     @pytest.mark.parametrize(
@@ -228,8 +192,8 @@ def test_setitem_integer_array(self, data, idx, box_in_series):
             arr = pd.Series(arr)
             expected = pd.Series(expected)
 
-        # Use json.dumps(arr[0]) instead of passing arr[0] directly to the super method.
-        arr[idx] = json.dumps(arr[0])
+        # Use `[arr[0]] * len()` instead of passing `arr[0]` directly to the super method.
+        arr[idx] = [arr[0]] * len(arr[idx])
        tm.assert_equal(arr, expected)
 
     @pytest.mark.parametrize("setter", ["loc", None])
@pytest.mark.parametrize("setter", ["loc", None])
@@ -243,60 +207,54 @@ def test_setitem_mask_broadcast(self, data, setter):
243207
else: # __setitem__
244208
target = ser
245209

246-
# Use json.dumps(data[10]) instead of passing data[10] directly to the super method.
247-
target[mask] = json.dumps(data[10])
210+
# Use `[data[10]] * len()` instead of passing `data[10]` directly to the super method.
211+
target[mask] = [data[10]] * len(target[mask])
248212
assert ser[0] == data[10]
249213
assert ser[1] == data[10]
250214

251215
def test_setitem_loc_scalar_mixed(self, data):
252216
df = pd.DataFrame({"A": np.arange(len(data)), "B": data})
253-
# Use json.dumps(data[1]) instead of passing data[1] directly to the super method.
254-
df.loc[0, "B"] = json.dumps(data[1])
217+
# Use `[data[1]]` instead of passing `data[1]` directly to the super method.
218+
df.loc[0, "B"] = [data[1]]
255219
assert df.loc[0, "B"] == data[1]
256220

221+
@pytest.mark.xfail(reason="TODO: open an issue for ArrowExtentionArray")
257222
def test_setitem_loc_scalar_single(self, data):
258-
df = pd.DataFrame({"B": data})
259-
# Use json.dumps(data[1]) instead of passing data[1] directly to the super method.
260-
df.loc[10, "B"] = json.dumps(data[1])
261-
assert df.loc[10, "B"] == data[1]
223+
super().test_setitem_loc_scalar_single(data)
262224

263225
def test_setitem_loc_iloc_slice(self, data):
264226
arr = data[:5].copy()
265227
s = pd.Series(arr, index=["a", "b", "c", "d", "e"])
266228
expected = pd.Series(data.take([0, 0, 0, 3, 4]), index=s.index)
267229

268230
result = s.copy()
269-
# Use json.dumps(data[0]) instead of passing data[0] directly to the super method.
270-
result.iloc[:3] = json.dumps(data[0])
231+
# Use `[data[0]] * len()` instead of passing `data[0]` directly to the super method.
232+
result.iloc[:3] = [data[0]] * len(result.iloc[:3])
271233
tm.assert_equal(result, expected)
272234

273235
result = s.copy()
274-
result.loc[:"c"] = json.dumps(data[0])
236+
result.loc[:"c"] = [data[0]] * len(result.loc[:"c"])
275237
tm.assert_equal(result, expected)
276238

239+
@pytest.mark.xfail(reason="TODO: open an issue for ArrowExtentionArray")
277240
def test_setitem_iloc_scalar_single(self, data):
278-
df = pd.DataFrame({"B": data})
279-
# Use json.dumps(data[1]) instead of passing data[1] directly to the super method.
280-
df.iloc[10, 0] = json.dumps(data[1])
281-
assert df.loc[10, "B"] == data[1]
241+
super().test_setitem_iloc_scalar_single(data)
282242

283243
def test_setitem_iloc_scalar_mixed(self, data):
284244
df = pd.DataFrame({"A": np.arange(len(data)), "B": data})
285-
# Use json.dumps(data[1]) instead of passing data[1] directly to the super method.
286-
df.iloc[0, 1] = json.dumps(data[1])
245+
# Use `[data[1]] * len()` instead of passing `data[1]` directly to the super method.
246+
df.iloc[0, 1] = [data[1]] * len(df.iloc[0, 1])
287247
assert df.loc[0, "B"] == data[1]
288248

289-
@pytest.mark.xfail(reaons="eq not implemented for <class 'dict'>")
249+
@pytest.mark.xfail(reason="eq not implemented for <class 'dict'>")
290250
def test_setitem_mask_boolean_array_with_na(self, data, box_in_series):
291251
super().test_setitem_mask_boolean_array_with_na(data, box_in_series)
292252

293253
@pytest.mark.parametrize("setter", ["loc", "iloc"])
254+
255+
@pytest.mark.xfail(reason="TODO: open an issue for ArrowExtentionArray")
294256
def test_setitem_scalar(self, data, setter):
295-
arr = pd.Series(data)
296-
setter = getattr(arr, setter)
297-
# Use json.dumps(data[1]) instead of passing data[1] directly to the super method.
298-
setter[0] = json.dumps(data[1])
299-
assert arr[0] == data[1]
257+
super().test_setitem_scalar(data, setter)
300258

301259
@pytest.mark.parametrize(
302260
"mask",
@@ -313,35 +271,24 @@ def test_setitem_mask(self, data, mask, box_in_series):
         if box_in_series:
             arr = pd.Series(arr)
             expected = pd.Series(expected)
-        # Use json.dumps(data[0]) instead of passing data[0] directly to the super method.
-        arr[mask] = json.dumps(data[0])
+        # Use `[data[0]] * len()` instead of passing `data[0]` directly to the super method.
+        arr[mask] = [data[0]] * len(arr[mask])
         tm.assert_equal(expected, arr)
 
+    @pytest.mark.xfail(reason="Setting a `dict` to an expansion row is not supported")
     def test_setitem_with_expansion_row(self, data, na_value):
-        df = pd.DataFrame({"data": data[:1]})
-
-        # Use json.dumps(data[1]) instead of passing data[1] directly to the super method.
-        df.loc[1, "data"] = json.dumps(data[1])
-        expected = pd.DataFrame({"data": data[:2]})
-        tm.assert_frame_equal(df, expected)
-
-        # https://github.com/pandas-dev/pandas/issues/47284
-        df.loc[2, "data"] = na_value
-        expected = pd.DataFrame(
-            {"data": pd.Series([data[0], data[1], na_value], dtype=data.dtype)}
-        )
-        tm.assert_frame_equal(df, expected)
+        super().test_setitem_with_expansion_row(data, na_value)
 
     def test_setitem_iloc_scalar_multiple_homogoneous(self, data):
         df = pd.DataFrame({"A": data, "B": data})
-        # Use json.dumps(data[1]) instead of passing data[1] directly to the super method.
-        df.iloc[10, 1] = json.dumps(data[1])
+        # Use `[data[1]]` instead of passing `data[1]` directly to the super method.
+        df.iloc[10, 1] = [data[1]]
         assert df.loc[10, "B"] == data[1]
 
     def test_setitem_loc_scalar_multiple_homogoneous(self, data):
         df = pd.DataFrame({"A": data, "B": data})
-        # Use json.dumps(data[1]) instead of passing data[1] directly to the super method.
-        df.loc[10, "B"] = json.dumps(data[1])
+        # Use `[data[1]]` instead of passing `data[1]` directly to the super method.
+        df.loc[10, "B"] = [data[1]]
         assert df.loc[10, "B"] == data[1]
 
     def test_setitem_slice(self, data, box_in_series):
@@ -351,8 +298,8 @@ def test_setitem_slice(self, data, box_in_series):
             arr = pd.Series(arr)
             expected = pd.Series(expected)
 
-        # Use json.dumps(data[0]) instead of passing data[0] directly to the super method.
-        arr[:3] = json.dumps(data[0])
+        # Use `[data[0]] * 3` instead of passing `data[0]` directly to the super method.
+        arr[:3] = [data[0]] * 3
         tm.assert_equal(arr, expected)
 
     @pytest.mark.xfail(reason="only integer scalar arrays can be converted")
