Skip to content

Commit 2bb80fc

Browse files
Merge remote-tracking branch 'upstream/master' into typing
2 parents 3d50dcc + 07efdd4 commit 2bb80fc

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+579
-646
lines changed

doc/source/getting_started/install.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ Recommended dependencies
218218
``numexpr`` uses multiple cores as well as smart chunking and caching to achieve large speedups.
219219
If installed, must be Version 2.6.2 or higher.
220220

221-
* `bottleneck <https://github.com/kwgoodman/bottleneck>`__: for accelerating certain types of ``nan``
221+
* `bottleneck <https://github.com/pydata/bottleneck>`__: for accelerating certain types of ``nan``
222222
evaluations. ``bottleneck`` uses specialized cython routines to achieve large speedups. If installed,
223223
must be Version 1.2.1 or higher.
224224

doc/source/user_guide/io.rst

Lines changed: 73 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -5576,7 +5576,7 @@ Performance considerations
55765576
--------------------------
55775577

55785578
This is an informal comparison of various IO methods, using pandas
5579-
0.20.3. Timings are machine dependent and small differences should be
5579+
0.24.2. Timings are machine dependent and small differences should be
55805580
ignored.
55815581

55825582
.. code-block:: ipython
@@ -5597,11 +5597,18 @@ Given the next test set:
55975597

55985598
.. code-block:: python
55995599
5600+
5601+
5602+
import numpy as np
5603+
56005604
import os
56015605
56025606
sz = 1000000
56035607
df = pd.DataFrame({'A': np.random.randn(sz), 'B': [1] * sz})
56045608
5609+
sz = 1000000
5610+
np.random.seed(42)
5611+
df = pd.DataFrame({'A': np.random.randn(sz), 'B': [1] * sz})
56055612
56065613
def test_sql_write(df):
56075614
if os.path.exists('test.sql'):
@@ -5610,151 +5617,152 @@ Given the next test set:
56105617
df.to_sql(name='test_table', con=sql_db)
56115618
sql_db.close()
56125619
5613-
56145620
def test_sql_read():
56155621
sql_db = sqlite3.connect('test.sql')
56165622
pd.read_sql_query("select * from test_table", sql_db)
56175623
sql_db.close()
56185624
5619-
56205625
def test_hdf_fixed_write(df):
56215626
df.to_hdf('test_fixed.hdf', 'test', mode='w')
56225627
5623-
56245628
def test_hdf_fixed_read():
56255629
pd.read_hdf('test_fixed.hdf', 'test')
56265630
5627-
56285631
def test_hdf_fixed_write_compress(df):
56295632
df.to_hdf('test_fixed_compress.hdf', 'test', mode='w', complib='blosc')
56305633
5631-
56325634
def test_hdf_fixed_read_compress():
56335635
pd.read_hdf('test_fixed_compress.hdf', 'test')
56345636
5635-
56365637
def test_hdf_table_write(df):
56375638
df.to_hdf('test_table.hdf', 'test', mode='w', format='table')
56385639
5639-
56405640
def test_hdf_table_read():
56415641
pd.read_hdf('test_table.hdf', 'test')
56425642
5643-
56445643
def test_hdf_table_write_compress(df):
56455644
df.to_hdf('test_table_compress.hdf', 'test', mode='w',
56465645
complib='blosc', format='table')
56475646
5648-
56495647
def test_hdf_table_read_compress():
56505648
pd.read_hdf('test_table_compress.hdf', 'test')
56515649
5652-
56535650
def test_csv_write(df):
56545651
df.to_csv('test.csv', mode='w')
56555652
5656-
56575653
def test_csv_read():
56585654
pd.read_csv('test.csv', index_col=0)
56595655
5660-
56615656
def test_feather_write(df):
56625657
df.to_feather('test.feather')
56635658
5664-
56655659
def test_feather_read():
56665660
pd.read_feather('test.feather')
56675661
5668-
56695662
def test_pickle_write(df):
56705663
df.to_pickle('test.pkl')
56715664
5672-
56735665
def test_pickle_read():
56745666
pd.read_pickle('test.pkl')
56755667
5676-
56775668
def test_pickle_write_compress(df):
56785669
df.to_pickle('test.pkl.compress', compression='xz')
56795670
5680-
56815671
def test_pickle_read_compress():
56825672
pd.read_pickle('test.pkl.compress', compression='xz')
56835673
5684-
When writing, the top-three functions in terms of speed are are
5685-
``test_pickle_write``, ``test_feather_write`` and ``test_hdf_fixed_write_compress``.
5674+
def test_parquet_write(df):
5675+
df.to_parquet('test.parquet')
5676+
5677+
def test_parquet_read():
5678+
pd.read_parquet('test.parquet')
5679+
5680+
When writing, the top three functions in terms of speed are ``test_feather_write``, ``test_hdf_fixed_write`` and ``test_hdf_fixed_write_compress``.
56865681

56875682
.. code-block:: ipython
56885683
5689-
In [14]: %timeit test_sql_write(df)
5690-
2.37 s ± 36.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5684+
In [4]: %timeit test_sql_write(df)
5685+
3.29 s ± 43.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
56915686
5692-
In [15]: %timeit test_hdf_fixed_write(df)
5693-
194 ms ± 65.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
5687+
In [5]: %timeit test_hdf_fixed_write(df)
5688+
19.4 ms ± 560 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
56945689
5695-
In [26]: %timeit test_hdf_fixed_write_compress(df)
5696-
119 ms ± 2.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
5690+
In [6]: %timeit test_hdf_fixed_write_compress(df)
5691+
19.6 ms ± 308 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
56975692
5698-
In [16]: %timeit test_hdf_table_write(df)
5699-
623 ms ± 125 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5693+
In [7]: %timeit test_hdf_table_write(df)
5694+
449 ms ± 5.61 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
57005695
5701-
In [27]: %timeit test_hdf_table_write_compress(df)
5702-
563 ms ± 23.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5696+
In [8]: %timeit test_hdf_table_write_compress(df)
5697+
448 ms ± 11.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
57035698
5704-
In [17]: %timeit test_csv_write(df)
5705-
3.13 s ± 49.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5699+
In [9]: %timeit test_csv_write(df)
5700+
3.66 s ± 26.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
57065701
5707-
In [30]: %timeit test_feather_write(df)
5708-
103 ms ± 5.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
5702+
In [10]: %timeit test_feather_write(df)
5703+
9.75 ms ± 117 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
57095704
5710-
In [31]: %timeit test_pickle_write(df)
5711-
109 ms ± 3.72 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
5705+
In [11]: %timeit test_pickle_write(df)
5706+
30.1 ms ± 229 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
57125707
5713-
In [32]: %timeit test_pickle_write_compress(df)
5714-
3.33 s ± 55.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5708+
In [12]: %timeit test_pickle_write_compress(df)
5709+
4.29 s ± 15.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5710+
5711+
In [13]: %timeit test_parquet_write(df)
5712+
67.6 ms ± 706 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
57155713
57165714
When reading, the top three are ``test_feather_read``, ``test_pickle_read`` and
57175715
``test_hdf_fixed_read``.
57185716

5717+
57195718
.. code-block:: ipython
57205719
5721-
In [18]: %timeit test_sql_read()
5722-
1.35 s ± 14.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5720+
In [14]: %timeit test_sql_read()
5721+
1.77 s ± 17.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5722+
5723+
In [15]: %timeit test_hdf_fixed_read()
5724+
19.4 ms ± 436 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
5725+
5726+
In [16]: %timeit test_hdf_fixed_read_compress()
5727+
19.5 ms ± 222 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
57235728
5724-
In [19]: %timeit test_hdf_fixed_read()
5725-
14.3 ms ± 438 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5729+
In [17]: %timeit test_hdf_table_read()
5730+
38.6 ms ± 857 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
57265731
5727-
In [28]: %timeit test_hdf_fixed_read_compress()
5728-
23.5 ms ± 672 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
5732+
In [18]: %timeit test_hdf_table_read_compress()
5733+
38.8 ms ± 1.49 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
57295734
5730-
In [20]: %timeit test_hdf_table_read()
5731-
35.4 ms ± 314 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
5735+
In [19]: %timeit test_csv_read()
5736+
452 ms ± 9.04 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
57325737
5733-
In [29]: %timeit test_hdf_table_read_compress()
5734-
42.6 ms ± 2.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
5738+
In [20]: %timeit test_feather_read()
5739+
12.4 ms ± 99.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
57355740
5736-
In [22]: %timeit test_csv_read()
5737-
516 ms ± 27.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5741+
In [21]: %timeit test_pickle_read()
5742+
18.4 ms ± 191 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
57385743
5739-
In [33]: %timeit test_feather_read()
5740-
4.06 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5744+
In [22]: %timeit test_pickle_read_compress()
5745+
915 ms ± 7.48 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
57415746
5742-
In [34]: %timeit test_pickle_read()
5743-
6.5 ms ± 172 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5747+
In [23]: %timeit test_parquet_read()
5748+
24.4 ms ± 146 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
57445749
5745-
In [35]: %timeit test_pickle_read_compress()
5746-
588 ms ± 3.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
57475750
5751+
For this test case, ``test.pkl.compress``, ``test.parquet`` and ``test.feather`` took the least space on disk.
57485752
Space on disk (in bytes)
57495753

57505754
.. code-block:: none
57515755
5752-
34816000 Aug 21 18:00 test.sql
5753-
24009240 Aug 21 18:00 test_fixed.hdf
5754-
7919610 Aug 21 18:00 test_fixed_compress.hdf
5755-
24458892 Aug 21 18:00 test_table.hdf
5756-
8657116 Aug 21 18:00 test_table_compress.hdf
5757-
28520770 Aug 21 18:00 test.csv
5758-
16000248 Aug 21 18:00 test.feather
5759-
16000848 Aug 21 18:00 test.pkl
5760-
7554108 Aug 21 18:00 test.pkl.compress
5756+
29519500 Oct 10 06:45 test.csv
5757+
16000248 Oct 10 06:45 test.feather
5758+
8281983 Oct 10 06:49 test.parquet
5759+
16000857 Oct 10 06:47 test.pkl
5760+
7552144 Oct 10 06:48 test.pkl.compress
5761+
34816000 Oct 10 06:42 test.sql
5762+
24009288 Oct 10 06:43 test_fixed.hdf
5763+
24009288 Oct 10 06:43 test_fixed_compress.hdf
5764+
24458940 Oct 10 06:44 test_table.hdf
5765+
24458940 Oct 10 06:44 test_table_compress.hdf
5766+
5767+
5768+

doc/source/whatsnew/v0.21.0.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ Highlights include:
2020
- Integration with `Apache Parquet <https://parquet.apache.org/>`__, including a new top-level :func:`read_parquet` function and :meth:`DataFrame.to_parquet` method, see :ref:`here <whatsnew_0210.enhancements.parquet>`.
2121
- New user-facing :class:`pandas.api.types.CategoricalDtype` for specifying
2222
categoricals independent of the data, see :ref:`here <whatsnew_0210.enhancements.categorical_dtype>`.
23-
- The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames is now consistent and no longer depends on whether `bottleneck <http://berkeleyanalytics.com/bottleneck>`__ is installed, and ``sum`` and ``prod`` on empty Series now return NaN instead of 0, see :ref:`here <whatsnew_0210.api_breaking.bottleneck>`.
23+
- The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames is now consistent and no longer depends on whether `bottleneck <https://bottleneck.readthedocs.io>`__ is installed, and ``sum`` and ``prod`` on empty Series now return NaN instead of 0, see :ref:`here <whatsnew_0210.api_breaking.bottleneck>`.
2424
- Compatibility fixes for pypy, see :ref:`here <whatsnew_0210.pypy>`.
2525
- Additions to the ``drop``, ``reindex`` and ``rename`` API to make them more consistent, see :ref:`here <whatsnew_0210.enhancements.drop_api>`.
2626
- Addition of the new methods ``DataFrame.infer_objects`` (see :ref:`here <whatsnew_0210.enhancements.infer_objects>`) and ``GroupBy.pipe`` (see :ref:`here <whatsnew_0210.enhancements.GroupBy_pipe>`).
@@ -390,7 +390,7 @@ Sum/Prod of all-NaN or empty Series/DataFrames is now consistently NaN
390390

391391

392392
The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames no longer depends on
393-
whether `bottleneck <http://berkeleyanalytics.com/bottleneck>`__ is installed, and return value of ``sum`` and ``prod`` on an empty Series has changed (:issue:`9422`, :issue:`15507`).
393+
whether `bottleneck <https://bottleneck.readthedocs.io>`__ is installed, and the return value of ``sum`` and ``prod`` on an empty Series has changed (:issue:`9422`, :issue:`15507`).
394394

395395
Calling ``sum`` or ``prod`` on an empty or all-``NaN`` ``Series``, or columns of a ``DataFrame``, will result in ``NaN``. See the :ref:`docs <missing_data.numeric_sum>`.
396396

doc/source/whatsnew/v0.8.1.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ Performance improvements
2929
~~~~~~~~~~~~~~~~~~~~~~~~
3030

3131
- Improved implementation of rolling min and max (thanks to `Bottleneck
32-
<http://berkeleyanalytics.com/bottleneck/>`__ !)
32+
<https://bottleneck.readthedocs.io>`__ !)
3333
- Add accelerated ``'median'`` GroupBy option (:issue:`1358`)
3434
- Significantly improve the performance of parsing ISO8601-format date
3535
strings with ``DatetimeIndex`` or ``to_datetime`` (:issue:`1571`)

doc/source/whatsnew/v1.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,7 @@ Plotting
414414
- Bug in the ``xticks`` argument being ignored for :meth:`DataFrame.plot.bar` (:issue:`14119`)
415415
- :func:`set_option` now validates that the plot backend provided to ``'plotting.backend'`` implements the backend when the option is set, rather than when a plot is created (:issue:`28163`)
416416
- :meth:`DataFrame.plot` now allows a ``backend`` keyword argument to allow changing between backends in one session (:issue:`28619`).
417+
- Bug in color validation incorrectly raising for non-color styles (:issue:`29122`).
417418

418419
Groupby/resample/rolling
419420
^^^^^^^^^^^^^^^^^^^^^^^^

pandas/_libs/groupby.pyx

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -753,8 +753,7 @@ def group_quantile(ndarray[float64_t] out,
753753
assert values.shape[0] == N
754754

755755
if not (0 <= q <= 1):
756-
raise ValueError("'q' must be between 0 and 1. Got"
757-
" '{}' instead".format(q))
756+
raise ValueError(f"'q' must be between 0 and 1. Got '{q}' instead")
758757

759758
inter_methods = {
760759
'linear': INTERPOLATION_LINEAR,

pandas/_libs/hashing.pyx

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ def hash_object_array(object[:] arr, object key, object encoding='utf8'):
4747
k = <bytes>key.encode(encoding)
4848
kb = <uint8_t *>k
4949
if len(k) != 16:
50-
raise ValueError("key should be a 16-byte string encoded, "
51-
"got {key} (len {klen})".format(key=k, klen=len(k)))
50+
raise ValueError(f"key should be a 16-byte string encoded, "
51+
f"got {k} (len {len(k)})")
5252

5353
n = len(arr)
5454

@@ -67,9 +67,9 @@ def hash_object_array(object[:] arr, object key, object encoding='utf8'):
6767
data = <bytes>str(val).encode(encoding)
6868

6969
else:
70-
raise TypeError("{val} of type {typ} is not a valid type "
71-
"for hashing, must be string or null"
72-
.format(val=val, typ=type(val)))
70+
raise TypeError(f"{val} of type {type(val)} is not a valid type "
71+
f"for hashing, must be string or null"
72+
)
7373

7474
l = len(data)
7575
lens[i] = l

pandas/_libs/index.pyx

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ cdef class IndexEngine:
109109
Py_ssize_t loc
110110

111111
if is_definitely_invalid_key(val):
112-
raise TypeError("'{val}' is an invalid key".format(val=val))
112+
raise TypeError(f"'{val}' is an invalid key")
113113

114114
if self.over_size_threshold and self.is_monotonic_increasing:
115115
if not self.is_unique:
@@ -556,8 +556,8 @@ cpdef convert_scalar(ndarray arr, object value):
556556
pass
557557
elif value is None or value != value:
558558
return np.datetime64("NaT", "ns")
559-
raise ValueError("cannot set a Timestamp with a non-timestamp {typ}"
560-
.format(typ=type(value).__name__))
559+
raise ValueError(f"cannot set a Timestamp with a non-timestamp "
560+
f"{type(value).__name__}")
561561

562562
elif arr.descr.type_num == NPY_TIMEDELTA:
563563
if util.is_array(value):
@@ -573,8 +573,8 @@ cpdef convert_scalar(ndarray arr, object value):
573573
pass
574574
elif value is None or value != value:
575575
return np.timedelta64("NaT", "ns")
576-
raise ValueError("cannot set a Timedelta with a non-timedelta {typ}"
577-
.format(typ=type(value).__name__))
576+
raise ValueError(f"cannot set a Timedelta with a non-timedelta "
577+
f"{type(value).__name__}")
578578

579579
if (issubclass(arr.dtype.type, (np.integer, np.floating, np.complex)) and
580580
not issubclass(arr.dtype.type, np.bool_)):
@@ -677,7 +677,7 @@ cdef class BaseMultiIndexCodesEngine:
677677
# Index._get_fill_indexer), sort (integer representations of) keys:
678678
order = np.argsort(lab_ints)
679679
lab_ints = lab_ints[order]
680-
indexer = (getattr(self._base, 'get_{}_indexer'.format(method))
680+
indexer = (getattr(self._base, f'get_{method}_indexer')
681681
(self, lab_ints, limit=limit))
682682
indexer = indexer[order]
683683
else:
@@ -687,7 +687,7 @@ cdef class BaseMultiIndexCodesEngine:
687687

688688
def get_loc(self, object key):
689689
if is_definitely_invalid_key(key):
690-
raise TypeError("'{key}' is an invalid key".format(key=key))
690+
raise TypeError(f"'{key}' is an invalid key")
691691
if not isinstance(key, tuple):
692692
raise KeyError(key)
693693
try:

pandas/_libs/internals.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ cdef class BlockPlacement:
6161
else:
6262
v = self._as_array
6363

64-
return '%s(%r)' % (self.__class__.__name__, v)
64+
return f'{self.__class__.__name__}({v})'
6565

6666
def __repr__(self) -> str:
6767
return str(self)

0 commit comments

Comments
 (0)