Skip to content

Commit 5c87a15

Browse files
author
MarcoGorelli
committed
Merge remote-tracking branch 'upstream/main' into share-datetime-parsing-format-paths
2 parents bb6a735 + d800024 commit 5c87a15

File tree

32 files changed

+435
-232
lines changed

32 files changed

+435
-232
lines changed

asv_bench/benchmarks/array.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,9 @@ def time_setitem_list(self, multiple_chunks):
9090
def time_setitem_slice(self, multiple_chunks):
9191
self.array[::10] = "foo"
9292

93+
def time_setitem_null_slice(self, multiple_chunks):
94+
self.array[:] = "foo"
95+
9396
def time_tolist(self, multiple_chunks):
9497
self.array.tolist()
9598

asv_bench/benchmarks/reshape.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,17 @@
1515

1616

1717
class Melt:
18-
def setup(self):
19-
self.df = DataFrame(np.random.randn(10000, 3), columns=["A", "B", "C"])
20-
self.df["id1"] = np.random.randint(0, 10, 10000)
21-
self.df["id2"] = np.random.randint(100, 1000, 10000)
18+
params = ["float64", "Float64"]
19+
param_names = ["dtype"]
20+
21+
def setup(self, dtype):
22+
self.df = DataFrame(
23+
np.random.randn(100_000, 3), columns=["A", "B", "C"], dtype=dtype
24+
)
25+
self.df["id1"] = pd.Series(np.random.randint(0, 10, 10000))
26+
self.df["id2"] = pd.Series(np.random.randint(100, 1000, 10000))
2227

23-
def time_melt_dataframe(self):
28+
def time_melt_dataframe(self, dtype):
2429
melt(self.df, id_vars=["id1", "id2"])
2530

2631

ci/deps/actions-38-downstream_compat.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,6 @@ dependencies:
5656
- zstandard
5757

5858
# downstream packages
59-
- aiobotocore
6059
- botocore
6160
- cftime
6261
- dask

doc/source/whatsnew/v2.0.0.rst

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,13 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following
3838
* :func:`read_csv`
3939
* :func:`read_excel`
4040
* :func:`read_sql`
41+
* :func:`read_sql_query`
42+
* :func:`read_sql_table`
4143

4244
Additionally, a new global configuration, ``mode.nullable_backend``, can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions
4345
to select the nullable dtypes implementation.
4446

45-
* :func:`read_csv` (with ``engine="pyarrow"``)
47+
* :func:`read_csv` (with ``engine="pyarrow"`` or ``engine="python"``)
4648
* :func:`read_excel`
4749
* :func:`read_parquet`
4850
* :func:`read_orc`
@@ -394,7 +396,7 @@ If installed, we now require:
394396
+-----------------+-----------------+----------+---------+
395397
| Package | Minimum Version | Required | Changed |
396398
+=================+=================+==========+=========+
397-
| mypy (dev) | 0.990 | | X |
399+
| mypy (dev) | 0.991 | | X |
398400
+-----------------+-----------------+----------+---------+
399401
| python-dateutil | 2.8.2 | X | X |
400402
+-----------------+-----------------+----------+---------+
@@ -736,6 +738,7 @@ Performance improvements
736738
- Performance improvement in :meth:`MultiIndex.isin` when ``level=None`` (:issue:`48622`, :issue:`49577`)
737739
- Performance improvement in :meth:`MultiIndex.putmask` (:issue:`49830`)
738740
- Performance improvement in :meth:`Index.union` and :meth:`MultiIndex.union` when index contains duplicates (:issue:`48900`)
741+
- Performance improvement in :meth:`Series.rank` for pyarrow-backed dtypes (:issue:`50264`)
739742
- Performance improvement in :meth:`Series.fillna` for extension array dtypes (:issue:`49722`, :issue:`50078`)
740743
- Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`)
741744
- Performance improvement for :class:`Series` constructor passing integer numpy array with nullable dtype (:issue:`48338`)
@@ -748,6 +751,7 @@ Performance improvements
748751
- Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`)
749752
- Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`)
750753
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`)
754+
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.__setitem__` when key is a null slice (:issue:`50248`)
751755
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`49973`)
752756
- Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`)
753757
- Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`)
@@ -831,8 +835,10 @@ Interval
831835

832836
Indexing
833837
^^^^^^^^
838+
- Bug in :meth:`DataFrame.__setitem__` raising when indexer is a :class:`DataFrame` with ``boolean`` dtype (:issue:`47125`)
834839
- Bug in :meth:`DataFrame.reindex` filling with wrong values when indexing columns and index for ``uint`` dtypes (:issue:`48184`)
835840
- Bug in :meth:`DataFrame.loc` coercing dtypes when setting values with a list indexer (:issue:`49159`)
841+
- Bug in :meth:`Series.loc` raising error for out of bounds end of slice indexer (:issue:`50161`)
836842
- Bug in :meth:`DataFrame.loc` raising ``ValueError`` with ``bool`` indexer and :class:`MultiIndex` (:issue:`47687`)
837843
- Bug in :meth:`DataFrame.__setitem__` raising ``ValueError`` when right hand side is :class:`DataFrame` with :class:`MultiIndex` columns (:issue:`49121`)
838844
- Bug in :meth:`DataFrame.reindex` casting dtype to ``object`` when :class:`DataFrame` has single extension array column when re-indexing ``columns`` and ``index`` (:issue:`48190`)
@@ -870,12 +876,13 @@ I/O
870876
- Bug in :func:`read_sas` caused fragmentation of :class:`DataFrame` and raised :class:`.errors.PerformanceWarning` (:issue:`48595`)
871877
- Improved error message in :func:`read_excel` by including the offending sheet name when an exception is raised while reading a file (:issue:`48706`)
872878
- Bug when pickling a subset of PyArrow-backed data that would serialize the entire data instead of the subset (:issue:`42600`)
879+
- Bug in :func:`read_sql_query` ignoring ``dtype`` argument when ``chunksize`` is specified and result is empty (:issue:`50245`)
873880
- Bug in :func:`read_csv` raising :class:`.errors.ParserError` with ``engine="c"`` for a single-line csv with fewer columns than ``names`` (:issue:`47566`)
874881
- Bug in displaying ``string`` dtypes not showing storage option (:issue:`50099`)
875882
- Bug in :func:`DataFrame.to_string` with ``header=False`` that printed the index name on the same line as the first row of the data (:issue:`49230`)
876883
- Fixed memory leak which stemmed from the initialization of the internal JSON module (:issue:`49222`)
877884
- Fixed issue where :func:`json_normalize` would incorrectly remove leading characters from column names that matched the ``sep`` argument (:issue:`49861`)
878-
-
885+
- Bug in :meth:`DataFrame.to_json` where it would segfault when failing to encode a string (:issue:`50307`)
879886

880887
Period
881888
^^^^^^
@@ -906,6 +913,7 @@ Reshaping
906913
^^^^^^^^^
907914
- Bug in :meth:`DataFrame.pivot_table` raising ``TypeError`` for nullable dtype and ``margins=True`` (:issue:`48681`)
908915
- Bug in :meth:`DataFrame.unstack` and :meth:`Series.unstack` unstacking wrong level of :class:`MultiIndex` when :class:`MultiIndex` has mixed names (:issue:`48763`)
916+
- Bug in :meth:`DataFrame.melt` losing extension array dtype (:issue:`41570`)
909917
- Bug in :meth:`DataFrame.pivot` not respecting ``None`` as column name (:issue:`48293`)
910918
- Bug in :func:`join` when ``left_on`` or ``right_on`` is or includes a :class:`CategoricalIndex` incorrectly raising ``AttributeError`` (:issue:`48464`)
911919
- Bug in :meth:`DataFrame.pivot_table` raising ``ValueError`` with parameter ``margins=True`` when result is an empty :class:`DataFrame` (:issue:`49240`)

environment.yml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,6 @@ dependencies:
6060
- zstandard
6161

6262
# downstream packages
63-
- aiobotocore<2.0.0 # GH#44311 pinned to fix docbuild
6463
- dask-core
6564
- seaborn-base
6665

@@ -69,7 +68,7 @@ dependencies:
6968
- flask
7069

7170
# benchmarks
72-
- asv
71+
- asv>=0.5.1
7372

7473
# The compiler packages are meta-packages and install the correct compiler (activation) packages on the respective platforms.
7574
- c-compiler
@@ -81,7 +80,7 @@ dependencies:
8180
- flake8=6.0.0
8281
- flake8-bugbear=22.7.1 # used by flake8, find likely bugs
8382
- isort>=5.2.1 # check that imports are in the right order
84-
- mypy=0.990
83+
- mypy=0.991
8584
- pre-commit>=2.15.0
8685
- pycodestyle # used by flake8
8786
- pyupgrade

pandas/_libs/src/ujson/python/objToJSON.c

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -332,9 +332,18 @@ static char *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc),
332332
return PyBytes_AS_STRING(obj);
333333
}
334334

335-
static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc),
335+
static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc,
336336
size_t *_outLen) {
337-
return (char *)PyUnicode_AsUTF8AndSize(_obj, (Py_ssize_t *)_outLen);
337+
char *encoded = (char *)PyUnicode_AsUTF8AndSize(_obj,
338+
(Py_ssize_t *)_outLen);
339+
if (encoded == NULL) {
340+
/* Something went wrong.
341+
Set errorMsg(to tell encoder to stop),
342+
and let Python exception propagate. */
343+
JSONObjectEncoder *enc = (JSONObjectEncoder *)tc->encoder;
344+
enc->errorMsg = "Encoding failed.";
345+
}
346+
return encoded;
338347
}
339348

340349
/* JSON callback. returns a char* and mutates the pointer to *len */

pandas/_libs/tslib.pyi

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ def format_array_from_datetime(
1212
reso: int = ..., # NPY_DATETIMEUNIT
1313
) -> npt.NDArray[np.object_]: ...
1414
def array_with_unit_to_datetime(
15-
values: np.ndarray,
15+
values: npt.NDArray[np.object_],
1616
unit: str,
1717
errors: str = ...,
1818
) -> tuple[np.ndarray, tzinfo | None]: ...

pandas/_libs/tslib.pyx

Lines changed: 5 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ import_datetime()
1818

1919
cimport numpy as cnp
2020
from numpy cimport (
21-
float64_t,
2221
int64_t,
2322
ndarray,
2423
)
@@ -231,7 +230,7 @@ def format_array_from_datetime(
231230

232231

233232
def array_with_unit_to_datetime(
234-
ndarray values,
233+
ndarray[object] values,
235234
str unit,
236235
str errors="coerce"
237236
):
@@ -266,70 +265,24 @@ def array_with_unit_to_datetime(
266265
cdef:
267266
Py_ssize_t i, n=len(values)
268267
int64_t mult
269-
int prec = 0
270-
ndarray[float64_t] fvalues
271268
bint is_ignore = errors=="ignore"
272269
bint is_coerce = errors=="coerce"
273270
bint is_raise = errors=="raise"
274-
bint need_to_iterate = True
275271
ndarray[int64_t] iresult
276272
ndarray[object] oresult
277-
ndarray mask
278273
object tz = None
279274

280275
assert is_ignore or is_coerce or is_raise
281276

282277
if unit == "ns":
283-
if issubclass(values.dtype.type, (np.integer, np.float_)):
284-
result = values.astype("M8[ns]", copy=False)
285-
else:
286-
result, tz = array_to_datetime(
287-
values.astype(object, copy=False),
288-
errors=errors,
289-
)
278+
result, tz = array_to_datetime(
279+
values.astype(object, copy=False),
280+
errors=errors,
281+
)
290282
return result, tz
291283

292284
mult, _ = precision_from_unit(unit)
293285

294-
if is_raise:
295-
# try a quick conversion to i8/f8
296-
# if we have nulls that are not type-compat
297-
# then need to iterate
298-
299-
if values.dtype.kind in ["i", "f", "u"]:
300-
iresult = values.astype("i8", copy=False)
301-
# fill missing values by comparing to NPY_NAT
302-
mask = iresult == NPY_NAT
303-
# Trying to Convert NaN to integer results in undefined
304-
# behaviour, so handle it explicitly (see GH #48705)
305-
if values.dtype.kind == "f":
306-
mask |= values != values
307-
iresult[mask] = 0
308-
fvalues = iresult.astype("f8") * mult
309-
need_to_iterate = False
310-
311-
if not need_to_iterate:
312-
# check the bounds
313-
if (fvalues < Timestamp.min.value).any() or (
314-
(fvalues > Timestamp.max.value).any()
315-
):
316-
raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'")
317-
318-
if values.dtype.kind in ["i", "u"]:
319-
result = (iresult * mult).astype("M8[ns]")
320-
321-
elif values.dtype.kind == "f":
322-
fresult = (values * mult).astype("f8")
323-
fresult[mask] = 0
324-
if prec:
325-
fresult = round(fresult, prec)
326-
result = fresult.astype("M8[ns]", copy=False)
327-
328-
iresult = result.view("i8")
329-
iresult[mask] = NPY_NAT
330-
331-
return result, tz
332-
333286
result = np.empty(n, dtype="M8[ns]")
334287
iresult = result.view("i8")
335288

pandas/_libs/tslibs/np_datetime.pyx

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -312,10 +312,10 @@ cpdef ndarray astype_overflowsafe(
312312
"""
313313
if values.descr.type_num == dtype.type_num == cnp.NPY_DATETIME:
314314
# i.e. dtype.kind == "M"
315-
pass
315+
dtype_name = "datetime64"
316316
elif values.descr.type_num == dtype.type_num == cnp.NPY_TIMEDELTA:
317317
# i.e. dtype.kind == "m"
318-
pass
318+
dtype_name = "timedelta64"
319319
else:
320320
raise TypeError(
321321
"astype_overflowsafe values.dtype and dtype must be either "
@@ -326,14 +326,14 @@ cpdef ndarray astype_overflowsafe(
326326
NPY_DATETIMEUNIT from_unit = get_unit_from_dtype(values.dtype)
327327
NPY_DATETIMEUNIT to_unit = get_unit_from_dtype(dtype)
328328

329-
if (
330-
from_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC
331-
or to_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC
332-
):
329+
if from_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
330+
raise TypeError(f"{dtype_name} values must have a unit specified")
331+
332+
if to_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
333333
# without raising explicitly here, we end up with a SystemError
334334
# built-in function [...] returned a result with an error
335335
raise ValueError(
336-
"datetime64/timedelta64 values and dtype must have a unit specified"
336+
f"{dtype_name} dtype must have a unit specified"
337337
)
338338

339339
if from_unit == to_unit:

0 commit comments

Comments
 (0)