Commit b32c5a8

Merge branch 'master' into reduction_dtypes_II
2 parents: a276624 + c3f0aac


58 files changed: +940 -321 lines

.github/workflows/asv-bot.yml

Lines changed: 0 additions & 78 deletions
This file was deleted.

.github/workflows/comment-commands.yml

Lines changed: 67 additions & 2 deletions
@@ -11,18 +11,83 @@ permissions:
 jobs:
   issue_assign:
     runs-on: ubuntu-22.04
+    if: (!github.event.issue.pull_request) && github.event.comment.body == 'take'
+    concurrency:
+      group: ${{ github.actor }}-issue-assign
     steps:
-      - if: (!github.event.issue.pull_request) && github.event.comment.body == 'take'
        run: |
          echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}"
          curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees
   preview_docs:
     runs-on: ubuntu-22.04
+    if: github.event.issue.pull_request && github.event.comment.body == '/preview'
+    concurrency:
+      group: ${{ github.actor }}-preview-docs
     steps:
-      - if: github.event.issue.pull_request && github.event.comment.body == '/preview'
        run: |
          if curl --output /dev/null --silent --head --fail "https://pandas.pydata.org/preview/${{ github.event.issue.number }}/"; then
            curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"body": "Website preview of this PR available at: https://pandas.pydata.org/preview/${{ github.event.issue.number }}/"}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/comments
          else
            curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"body": "No preview found for PR #${{ github.event.issue.number }}. Did the docs build complete?"}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/comments
          fi
+  asv_run:
+    runs-on: ubuntu-22.04
+    # TODO: Support more benchmarking options later, against different branches, against self, etc
+    if: github.event.issue.pull_request && startsWith(github.event.comment.body, '@github-actions benchmark')
+    defaults:
+      run:
+        shell: bash -el {0}
+    env:
+      ENV_FILE: environment.yml
+      COMMENT: ${{github.event.comment.body}}
+
+    concurrency:
+      # Set concurrency to prevent abuse (full runs are ~5.5 hours !!!)
+      # each user can only run one concurrent benchmark bot at a time
+      # We don't cancel in progress jobs, but if you want to benchmark multiple PRs, you're gonna have
+      # to wait
+      group: ${{ github.actor }}-asv
+      cancel-in-progress: false
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      # Although asv sets up its own env, deps are still needed
+      # during discovery process
+      - name: Set up Conda
+        uses: ./.github/actions/setup-conda
+
+      - name: Run benchmarks
+        id: bench
+        continue-on-error: true # asv will exit code 1 for regressions
+        run: |
+          # extracting the regex, see https://stackoverflow.com/a/36798723
+          REGEX=$(echo "$COMMENT" | sed -n "s/^.*-b\s*\(\S*\).*$/\1/p")
+          cd asv_bench
+          asv check -E existing
+          git remote add upstream https://github.com/pandas-dev/pandas.git
+          git fetch upstream
+          asv machine --yes
+          asv continuous -f 1.1 -b $REGEX upstream/main HEAD
+          echo 'BENCH_OUTPUT<<EOF' >> $GITHUB_ENV
+          asv compare -f 1.1 upstream/main HEAD >> $GITHUB_ENV
+          echo 'EOF' >> $GITHUB_ENV
+          echo "REGEX=$REGEX" >> $GITHUB_ENV
+
+      - uses: actions/github-script@v6
+        env:
+          BENCH_OUTPUT: ${{env.BENCH_OUTPUT}}
+          REGEX: ${{env.REGEX}}
+        with:
+          script: |
+            const ENV_VARS = process.env
+            const run_url = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`
+            github.rest.issues.createComment({
+              issue_number: context.issue.number,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              body: '\nBenchmarks completed. View runner logs here.' + run_url + '\nRegex used: '+ 'regex ' + ENV_VARS["REGEX"] + '\n' + ENV_VARS["BENCH_OUTPUT"]
+            })
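The sed line in the benchmark step pulls the `-b` filter out of the triggering comment. For readers who don't speak sed, here is a rough Python sketch of that extraction (the sample comment is hypothetical; note sed's greedy `^.*` anchors to the last `-b`, while this sketch takes the first):

import re

# Hypothetical trigger comment posted on a PR
comment = "@github-actions benchmark -b groupby"

# Rough equivalent of: sed -n "s/^.*-b\s*\(\S*\).*$/\1/p"
match = re.search(r"-b\s*(\S*)", comment)
regex = match.group(1) if match else ""
print(regex)  # "groupby" -- later fed to `asv continuous -b $REGEX`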

.github/workflows/sdist.yml

Lines changed: 3 additions & 3 deletions
@@ -81,11 +81,11 @@ jobs:
         run: |
           case "${{matrix.python-version}}" in
           3.8)
-            pip install numpy==1.20.3 ;;
+            pip install numpy==1.21.6 ;;
           3.9)
-            pip install numpy==1.20.3 ;;
+            pip install numpy==1.21.6 ;;
           3.10)
-            pip install numpy==1.21.2 ;;
+            pip install numpy==1.21.6 ;;
           3.11)
             pip install numpy==1.23.2 ;;
           esac

ci/deps/actions-38-minimum_versions.yaml

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ dependencies:

   # required dependencies
   - python-dateutil=2.8.2
-  - numpy=1.20.3
+  - numpy=1.21.6
   - pytz=2020.1

   # optional dependencies

doc/source/getting_started/install.rst

Lines changed: 1 addition & 1 deletion
@@ -260,7 +260,7 @@ pandas requires the following dependencies.
 ================================================================ ==========================
 Package                                                          Minimum supported version
 ================================================================ ==========================
-`NumPy <https://numpy.org>`__                                    1.20.3
+`NumPy <https://numpy.org>`__                                    1.21.6
 `python-dateutil <https://dateutil.readthedocs.io/en/stable/>`__ 2.8.2
 `pytz <https://pypi.org/project/pytz/>`__                        2020.1
 ================================================================ ==========================

doc/source/user_guide/groupby.rst

Lines changed: 1 addition & 1 deletion
@@ -196,7 +196,7 @@ only verifies that you've passed a valid mapping.
 GroupBy sorting
 ~~~~~~~~~~~~~~~~~~~~~~~~~

-By default the group keys are sorted during the ``groupby`` operation. You may however pass ``sort=False`` for potential speedups:
+By default the group keys are sorted during the ``groupby`` operation. You may however pass ``sort=False`` for potential speedups. With ``sort=False`` the order among group-keys follows the order of appearance of the keys in the original dataframe:

 .. ipython:: python
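The sentence added to the docs is easy to demonstrate. A minimal sketch (not part of the commit):

import pandas as pd

df = pd.DataFrame({"key": ["b", "a", "b", "c"], "val": [1, 2, 3, 4]})

# Default: group keys come back sorted
print(df.groupby("key").sum().index.tolist())              # ['a', 'b', 'c']

# sort=False: keys keep their order of first appearance
print(df.groupby("key", sort=False).sum().index.tolist())  # ['b', 'a', 'c']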
doc/source/user_guide/io.rst

Lines changed: 15 additions & 2 deletions
@@ -998,7 +998,7 @@ pass ``format='mixed'``

 .. ipython:: python

-   data = io.StringIO("date\n12 Jan 2000\n2000-01-13\n")
+   data = StringIO("date\n12 Jan 2000\n2000-01-13\n")
    df = pd.read_csv(data)
    df['date'] = pd.to_datetime(df['date'], format='mixed')
    df
@@ -1007,7 +1007,7 @@ or, if your datetime formats are all ISO8601 (possibly not identically-formatted

 .. ipython:: python

-   data = io.StringIO("date\n2020-01-01\n2020-01-01 03:00\n")
+   data = StringIO("date\n2020-01-01\n2020-01-01 03:00\n")
    df = pd.read_csv(data)
    df['date'] = pd.to_datetime(df['date'], format='ISO8601')
    df
@@ -2167,6 +2167,19 @@ Dates written in nanoseconds need to be read back in nanoseconds:
    dfju = pd.read_json(json, date_unit="ns")
    dfju

+By setting the ``dtype_backend`` argument you can control the default dtypes used for the resulting DataFrame.
+
+.. ipython:: python
+
+   data = (
+    '{"a":{"0":1,"1":3},"b":{"0":2.5,"1":4.5},"c":{"0":true,"1":false},"d":{"0":"a","1":"b"},'
+    '"e":{"0":null,"1":6.0},"f":{"0":null,"1":7.5},"g":{"0":null,"1":true},"h":{"0":null,"1":"a"},'
+    '"i":{"0":"12-31-2019","1":"12-31-2019"},"j":{"0":null,"1":null}}'
+   )
+   df = pd.read_json(StringIO(data), dtype_backend="pyarrow")
+   df
+   df.dtypes
+
 .. _io.json_normalize:

 Normalization
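Besides "pyarrow", ``dtype_backend`` also accepts "numpy_nullable". A minimal sketch (not part of the commit) contrasting the two backends on a column containing a null; the exact dtypes shown in the comments are the expected outcome, not output quoted from the docs:

import pandas as pd
from io import StringIO

data = '{"a":{"0":1,"1":null},"b":{"0":2.5,"1":4.5}}'

# Nullable NumPy-backed extension dtypes: "a" is expected to come back
# as Int64 holding pd.NA rather than float64 with NaN
print(pd.read_json(StringIO(data), dtype_backend="numpy_nullable").dtypes)

# PyArrow-backed dtypes: e.g. int64[pyarrow] and double[pyarrow]
print(pd.read_json(StringIO(data), dtype_backend="pyarrow").dtypes)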

doc/source/whatsnew/v2.0.1.rst

Lines changed: 3 additions & 1 deletion
@@ -15,6 +15,7 @@ Fixed regressions
 ~~~~~~~~~~~~~~~~~
 - Fixed regression for subclassed Series when constructing from a dictionary (:issue:`52445`)
 - Fixed regression in :meth:`DataFrame.pivot` changing :class:`Index` name of input object (:issue:`52629`)
+- Fixed regression in :meth:`DataFrame.resample` raising on a DataFrame with no columns (:issue:`52484`)
 - Fixed regression in :meth:`DataFrame.sort_values` not resetting index when :class:`DataFrame` is already sorted and ``ignore_index=True`` (:issue:`52553`)
 - Fixed regression in :meth:`MultiIndex.isin` raising ``TypeError`` for ``Generator`` (:issue:`52568`)
 - Fixed regression in :meth:`Series.describe` showing ``RuntimeWarning`` for extension dtype :class:`Series` with one element (:issue:`52515`)
@@ -27,7 +28,8 @@ Bug fixes
 - Bug in :attr:`Series.dt.days` that would overflow ``int32`` number of days (:issue:`52391`)
 - Bug in :class:`arrays.DatetimeArray` constructor returning an incorrect unit when passed a non-nanosecond numpy datetime array (:issue:`52555`)
 - Bug in :func:`Series.median` with :class:`ArrowDtype` returning an approximate median (:issue:`52679`)
-- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on-categorical dtypes (:issue:`49889`)
+- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on categorical dtypes (:issue:`49889`)
+- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on large string dtypes (:issue:`52795`)
 - Bug in :func:`pandas.testing.assert_series_equal` where ``check_dtype=False`` would still raise for datetime or timedelta types with different resolutions (:issue:`52449`)
 - Bug in :func:`read_csv` casting PyArrow datetimes to NumPy when ``dtype_backend="pyarrow"`` and ``parse_dates`` is set causing a performance bottleneck in the process (:issue:`52546`)
 - Bug in :func:`to_datetime` and :func:`to_timedelta` when trying to convert numeric data with a :class:`ArrowDtype` (:issue:`52425`)

doc/source/whatsnew/v2.1.0.rst

Lines changed: 3 additions & 0 deletions
@@ -131,6 +131,8 @@ If installed, we now require:
 +-----------------+-----------------+----------+---------+
 | Package         | Minimum Version | Required | Changed |
 +=================+=================+==========+=========+
+| numpy           | 1.21.6          |    X     |    X    |
++-----------------+-----------------+----------+---------+
 | mypy (dev)      | 1.2             |          |    X    |
 +-----------------+-----------------+----------+---------+
 | beautifulsoup4  | 4.11.1          |          |    X    |
@@ -316,6 +318,7 @@ Conversion
 ^^^^^^^^^^
 - Bug in :func:`DataFrame.style.to_latex` and :func:`DataFrame.style.to_html` if the DataFrame contains integers with more digits than can be represented by floating point double precision (:issue:`52272`)
 - Bug in :meth:`ArrowDtype.numpy_dtype` returning nanosecond units for non-nanosecond ``pyarrow.timestamp`` and ``pyarrow.duration`` types (:issue:`51800`)
+- Bug in :meth:`DataFrame.__repr__` incorrectly raising a ``TypeError`` when the dtype of a column is ``np.record`` (:issue:`48526`)
 - Bug in :meth:`DataFrame.info` raising ``ValueError`` when ``use_numba`` is set (:issue:`51922`)
 -
pandas/_libs/tslibs/conversion.pyx

Lines changed: 1 addition & 2 deletions
@@ -61,7 +61,6 @@ from pandas._libs.tslibs.nattype cimport (
     c_nat_strings as nat_strings,
 )
 from pandas._libs.tslibs.parsing cimport parse_datetime_string
-from pandas._libs.tslibs.timestamps cimport _Timestamp
 from pandas._libs.tslibs.timezones cimport (
     get_utcoffset,
     is_utc,
@@ -761,7 +760,7 @@ cdef int64_t parse_pydatetime(
         _ts.ensure_reso(NPY_FR_ns)
         result = _ts.value
     else:
-        if isinstance(val, _Timestamp):
+        if isinstance(val, ABCTimestamp):
            result = val.as_unit("ns")._value
        else:
            result = pydatetime_to_dt64(val, dts)

pandas/_libs/tslibs/timestamps.pyx

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ from numpy cimport (

 cnp.import_array()

-from cpython.datetime cimport ( # alias bc `tzinfo` is a kwarg below
+from cpython.datetime cimport ( # alias tzinfo_type bc `tzinfo` is a kwarg below
     PyDate_Check,
     PyDateTime_Check,
     PyDelta_Check,

pandas/_libs/tslibs/util.pxd

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@ cdef extern from "Python.h":
     bint PyComplex_Check(object obj) nogil
     bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil

+    # TODO(cython3): cimport this, xref GH#49670
     # Note that following functions can potentially raise an exception,
     # thus they cannot be declared 'nogil'. Also PyUnicode_AsUTF8AndSize() can
     # potentially allocate memory inside in unlikely case of when underlying

pandas/_typing.py

Lines changed: 43 additions & 0 deletions
@@ -132,6 +132,8 @@
 ]
 Timezone = Union[str, tzinfo]

+ToTimestampHow = Literal["s", "e", "start", "end"]
+
 # NDFrameT is stricter and ensures that the same subclass of NDFrame always is
 # used. E.g. `def func(a: NDFrameT) -> NDFrameT: ...` means that if a
 # Series is passed into a function, a Series is always returned and if a DataFrame is
@@ -361,6 +363,9 @@ def closed(self) -> bool:
 SortKind = Literal["quicksort", "mergesort", "heapsort", "stable"]
 NaPosition = Literal["first", "last"]

+# Arguments for nsmallest and nlargest
+NsmallestNlargestKeep = Literal["first", "last", "all"]
+
 # quantile interpolation
 QuantileInterpolation = Literal["linear", "lower", "higher", "midpoint", "nearest"]

@@ -372,9 +377,32 @@ def closed(self) -> bool:

 # merge
 MergeHow = Literal["left", "right", "inner", "outer", "cross"]
+MergeValidate = Literal[
+    "one_to_one",
+    "1:1",
+    "one_to_many",
+    "1:m",
+    "many_to_one",
+    "m:1",
+    "many_to_many",
+    "m:m",
+]

 # join
 JoinHow = Literal["left", "right", "inner", "outer"]
+JoinValidate = Literal[
+    "one_to_one",
+    "1:1",
+    "one_to_many",
+    "1:m",
+    "many_to_one",
+    "m:1",
+    "many_to_many",
+    "m:m",
+]
+
+# reindex
+ReindexMethod = Union[FillnaOptions, Literal["nearest"]]

 MatplotlibColor = Union[str, Sequence[float]]
 TimeGrouperOrigin = Union[
@@ -400,3 +428,18 @@ def closed(self) -> bool:
     "backslashreplace",
     "namereplace",
 ]
+
+# update
+UpdateJoin = Literal["left"]
+
+# applymap
+NaAction = Literal["ignore"]
+
+# from_dict
+FromDictOrient = Literal["columns", "index", "tight"]
+
+# to_gbq
+ToGbqIfexist = Literal["fail", "replace", "append"]
+
+# to_stata
+ToStataByteorder = Literal[">", "<", "little", "big"]
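These aliases let each signature spell out its accepted strings once and get static checking for free. A minimal sketch of how such a Literal alias is consumed (the ``merge`` stub is illustrative, not pandas' real signature):

from typing import Literal, Optional

MergeValidate = Literal[
    "one_to_one", "1:1", "one_to_many", "1:m",
    "many_to_one", "m:1", "many_to_many", "m:m",
]

def merge(validate: Optional[MergeValidate] = None) -> None:
    """Toy stand-in for the ``validate`` parameter of a merge."""

merge(validate="m:1")   # OK
merge(validate="1-1")   # flagged by mypy/pyright: not a member of the Literal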
